You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2017/09/01 14:08:41 UTC
svn commit: r1806964 -
/uima/uv3/uimaj-v3/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCompare.java
Author: schor
Date: Fri Sep 1 14:08:41 2017
New Revision: 1806964
URL: http://svn.apache.org/viewvc?rev=1806964&view=rev
Log:
no Jira - cleanup XmiCompare for general usage, improve javadocs
Modified:
uima/uv3/uimaj-v3/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCompare.java
Modified: uima/uv3/uimaj-v3/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCompare.java
URL: http://svn.apache.org/viewvc/uima/uv3/uimaj-v3/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCompare.java?rev=1806964&r1=1806963&r2=1806964&view=diff
==============================================================================
--- uima/uv3/uimaj-v3/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCompare.java (original)
+++ uima/uv3/uimaj-v3/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCompare.java Fri Sep 1 14:08:41 2017
@@ -27,13 +27,22 @@ import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Iterator;
+import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.uima.UIMAFramework;
-import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.internal.util.Misc;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.StringArray;
+import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.CasCreationUtils;
@@ -49,30 +58,42 @@ import org.apache.uima.util.XMLInputSour
* It takes two directories, with the xmi CASs to compare (must be named the same):
* each directory has 1 special file named: CAS_typeSystem_desc.xml - this is the type system
* each has n other files: xmi CASes to compare
+ * It takes an int "skip" argument, to skip over that many CASs before starting comparison.
*
- * Operation:
- * Prepare two CASs with the type system specified.
- * Iterate over one of the set of xmi's:
- * find the other corresponding xmi by name match
- * load both
- * compare the two cas's
- * print the results
+ * It supports comparing results of UIMA V2 with V3, using a convention:
+ * The input directory paths must contain uv2-out or uv3-out (the check is a substring match on the full path string)
*
* Compare technique:
* Get a set of roots - the items that are in any index
* Sort that by type, and then by content.
*
- * Compare: to compare FSRefs, follow the refs (but track, so don't get into loop)
- * -- set to compare all elements of arrays
+ * There are 3 sets of compare-relaxers.
+ * - uv2 to uv2
+ * - uv3 to uv3
+ * - uv2 to uv3
+ *
*/
public class XmiCompare {
private static final String BLANKS_89 = Misc.blanks.substring(0, 89);
Path d1, d2; // the input directories
- String d2String;
+ private String d2String;
+ private String d1String;
+
+ boolean isV2V2;
+ boolean isV3V3;
+ boolean isV2V3;
+
CASImpl c1, c2;
boolean isOk = true;
+
+ private int itemCount = 0;
+
+ private CasCompare cc;
+
+ private int skip;
+
public static void main(String[] args) {
@@ -81,21 +102,37 @@ public class XmiCompare {
void run(String[] args) {
try {
+ itemCount = 0;
// alternative to supplying args- hard code the path :-)
if (args == null || args.length == 0) {
-// d1 = Paths.get("C:/au/t/uimaj/comparev2v3watsonx/med_nlp_with_20_notes_v2_afterFix1/out");
- d1 = Paths.get("C:/au/t/uimaj/comparev2v3watsonx/uimav3test_with_20_notes/out");
-// d1 = Paths.get("C:/au/t/uimaj/comparev2v3watsonx/med_nlp_with_20_notes_v2_2/out");
-// d2 = Paths.get("C:/au/t/uimaj/comparev2v3watsonx/med_nlp_with_20_notes_v2/out");
-// d2 = Paths.get("C:/au/t/uimaj/comparev2v3watsonx/med_nlp_with_20_notes_v2_2a/out");
- d2 = Paths.get("C:/au/t/uimaj/comparev2v3watsonx/med_nlp_with_20_notes_v2_afterFix/out");
+ d1 = Paths.get("some-explicit-coded-path/uv2-out-some-suffix");
+ d2 = Paths.get("some-explicit-coded-path/uv2-out-some-other-suffix");
+
+
+// skip = 725; // optional skip amount
} else {
d1 = Paths.get(args[0]);
d2 = Paths.get(args[1]);
+ skip = Integer.parseInt(args[2]);
}
- System.out.println("Comparing " + d1 + " to " + d2);
+ d1String = d1.toString();
d2String = d2.toString();
+
+ boolean d1v2 = d1String.contains("uv2-out");
+ boolean d2v2 = d2String.contains("uv2-out");
+ boolean d1v3 = d1String.contains("uv3-out");
+ boolean d2v3 = d2String.contains("uv3-out");
+
+ isV2V2 = d1v2 && d2v2;
+ isV3V3 = d1v3 && d2v3;
+ isV2V3 = (d1v2 && d2v3) || (d1v3 && d2v2);
+
+ System.out.println("Comparing " + d1String + " to " + d2String);
+ if (isV2V2) System.out.println("\napplying fixups for v2 versus v2 comparison");
+ if (isV2V3) System.out.println("\napplying fixups for v2 versus v3 comparison");
+ if (isV3V3) System.out.println("\napplying fixups for v3 versus v3 comparison");
+
// read the type system descriptor
File typeSystemFile = Paths.get(d2String, "CAS_typeSystem_desc.xml").toFile();
TypeSystemDescription typeSystemDescription = UIMAFramework.getXMLParser().parseTypeSystemDescription(
@@ -132,6 +169,17 @@ public class XmiCompare {
if (!p2.toFile().exists()) {
return;
}
+
+ itemCount++;
+
+ System.out.format("%,5d Comparing %s:",
+ itemCount,
+ p1.getFileName().toString());
+
+ if (itemCount <= skip) {
+ System.out.println(" skipped");
+ return;
+ }
try {
CasIOUtils.load(new FileInputStream(p1.toFile()), c1);
@@ -139,17 +187,38 @@ public class XmiCompare {
} catch (IOException e) {
throw new RuntimeException(e);
}
-
- Iterator<FsIndex_singletype<FeatureStructure>> il1 = c1.indexRepository.streamNonEmptyIndexes(c1.getTypeSystemImpl().getTopType()).collect(Collectors.toList()).iterator();
- Iterator<FsIndex_singletype<FeatureStructure>> il2 = c2.indexRepository.streamNonEmptyIndexes(c2.getTypeSystemImpl().getTopType()).collect(Collectors.toList()).iterator();
+
+ compareNumberOfFSsByType();
+
+ cc = new CasCompare(c1, c2);
+ cc.compareAll(true);
+
+ // put temporary customizations here.
+// removeAllExcept("SomeType");
+
+ customizeCompare();
+
+ isOk = cc.compareCASes();
+
+ if (isOk) {
+ System.out.println(" compare:OK");
+ }
+
+ }
+
+ private void compareNumberOfFSsByType() {
+ Iterator<FsIndex_singletype<TOP>> il1 = c1.indexRepository.streamNonEmptyIndexes(TOP.class).collect(Collectors.toList()).iterator();
+ Iterator<FsIndex_singletype<TOP>> il2 = c2.indexRepository.streamNonEmptyIndexes(TOP.class).collect(Collectors.toList()).iterator();
+
StringBuilder sb = new StringBuilder();
+ StringBuilder sba = new StringBuilder();
boolean isSame = il1.hasNext() || il2.hasNext();
while( il1.hasNext() || il2.hasNext()) {
sb.setLength(0);
String ts1 = null, ts2 = null;
int sz1 = 0, sz2 = 0;
- FsIndex_singletype<FeatureStructure> idx;
+ FsIndex_singletype<TOP> idx;
if (il1.hasNext()) {
idx = il1.next();
ts1 = idx.getType().getName();
@@ -163,30 +232,288 @@ public class XmiCompare {
idx = il2.next();
ts2 = idx.getType().getName();
sz2 = idx.size();
- sb.append(String.format(" %,5d %s", sz2, ts2));
+ String m = (ts2.equals(ts1) && sz2 == sz1)
+ ? "same"
+ : ts2;
+ sb.append(String.format(" %,5d %s", sz2, m));
} else {
isSame = false;
}
- System.out.println(sb.toString());
+ sba.append(sb).append('\n');
if (isSame) {
isSame = ts1.equals(ts2) && sz1 == sz2;
}
}
- System.out.println(isSame ? "Same number of types" : "Different numbers of types");
-
- CasCompare cc = new CasCompare(c1, c2);
-// cc.compareStringArraysAsSets(true);
- cc.compareArraysByElement(true);
- cc.compareAll(true);
- isOk = cc.compareCASes();
+ if (!isSame) {
+ System.out.println(sba);
+ }
+ System.out.format(" %s", isSame ? "Same number of types" : "Different numbers of types");
+// System.out.println(" skp cpr");
+// if (true) return;
+ }
+
+ /**
+  * Dispatches to the customization method matching the kind of comparison
+  * (V2 vs V2, V3 vs V3, or V2 vs V3), based on the isV2V2 / isV3V3 / isV2V3 flags.
+  * If none of the flags is set, no customization is applied.
+  */
+ private void customizeCompare() {
+ if (isV2V2) {
+ customizeV2V2();
+ }
+
+ if (isV3V3) {
+ customizeV3V3();
+ }
- if (isOk) {
- System.out.println("\n***************\n" +
- "* COMPARE OK *\n" +
- "***************\n\n" );
+ if (isV2V3) {
+ customizeV2V3();
}
+ }
+
+ /**
+  * Customizations applied when both inputs are V2 outputs (called from customizeCompare
+  * when isV2V2 is set).
+  *
+  * The only active statement registers a string congruence set on the comparer; the
+  * commented-out lines are kept as examples of other fixups that can be switched on.
+  */
+ private void customizeV2V2() {
+// List<Runnable> r = sortFSArray("com.ibm.watsonx.nlp_med.common_types.UMLS.Concept", "innerConcepts");
+//
+// for (Runnable a : r) {
+// if (a != null) {
+// a.run();
+// }
+// }
+
+ // before sortStringArray
+// fixupTermMentionTypesUnknown();
+// cc.compareStringsAsEqual("com.ibm.watsonx.nlp_di.cas_term.Term", "mentionTypes", new String[]{"CATEGORY", "UNKNOWN"}, 1);
+// cc.compareStringsAsEqual("com.ibm.watsonx.nlp_di.cas_term.Term", "mentionTypes", new String[]{"CATEGORY", "UNKNOWN"}, 0);
+// cc.compareStringsAsEqual("com.ibm.watsonx.nlp_di.cas_term.Term", "mentionTypes", new String[]{"CATEGORY", "UNKNOWN"}, 2);
+
+// sortStringArray("com.ibm.watsonx.nlp_di.cas_term.Term", "mentionTypes");
+// v2FixupMentionType_mi("com.ibm.watsonx.nlp_di.hutt.UnitOfMeasurement");
+ // NOTE(review): presumably makes "Odd" and "Even" compare as equal for PseudoParagraph.ruleId;
+ // the meaning of the trailing -1 is defined by CasCompare - confirm there
+ cc.addStringCongruenceSet("com.ibm.watsonx.nlp_di.xsg.PseudoParagraph", "ruleId", new String[]{"Odd", "Even"}, -1);
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.Person",
+// "R2/2.2.2/Pre/NONDEFROLE/\\Monitor",
+// "R2/2.2.2/Pre/NONDEFROLE/\\monitor");
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.NondefiningRole",
+// "R2/2.2.2/Pre/NONDEFROLE/\\Monitor",
+// "R2/2.2.2/Pre/NONDEFROLE/\\monitor");
+//
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.Symptom",
+// "R2/2.2.2/Pre/NONDEFROLE/\\Sleeping",
+// "R2/2.2.2/Pre/NONDEFROLE/\\sleeping");
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.Symptom",
+// "R2/2.2.2/Main/SYMPTOM/\\Sleeping",
+// "R2/2.2.2/Main/SYMPTOM/\\sleeping");
+ }
+
+ /**
+  * Customizations applied when both inputs are V3 outputs (called from customizeCompare
+  * when isV3V3 is set).
+  *
+  * As written, every sortFSArray line is commented out, so the action list stays empty
+  * and this method changes nothing; uncomment the lines needed for a given comparison.
+  */
+ private void customizeV3V3() {
+ // deferred sort actions, collected first and run afterwards
+ List<Runnable> r = new ArrayList<>();
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.cas_term.Term", "outgoingLinks"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.cas_term.Term", "incomingLinks"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.hutt.Predicate", "arguments"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.cas_term.Expression", "termLinks"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_med.common_types.UMLS.Concept", "innerConcepts"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.common_types.generic_relation.GenericRelation", "args"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.hutt.Predicate", "sources"));
+
+
+ // entries may be null, so each one is checked before it is run
+ for (Runnable a : r) {
+ if (a != null) {
+ a.run();
+ }
+ }
+
+// v2FixupMentionType_mi("com.ibm.watsonx.nlp_di.hutt.UnitOfMeasurement");
+// cc.addStringCongruenceSet("com.ibm.watsonx.nlp_di.xsg.PseudoParagraph", "ruleId", new String[]{"Odd", "Even"}, -1);
+// fixupTermMentionTypesUnknown();
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.UsState",
+// "R2/2.2.2/Pre/STATE/State",
+// "R2/2.2.2/Pre/STATE/state");
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.UsState",
+// "R2/2.2.2/Pre/STATE/\\States",
+// "R2/2.2.2/Pre/STATE/\\states");
+ }
+
+ /**
+  * Customizations applied when one input is a V2 output and the other a V3 output
+  * (called from customizeCompare when isV2V3 is set).
+  *
+  * As written, the action list stays empty because every sortFSArray line is commented
+  * out. The two active statements at the end register a string congruence set on the
+  * comparer and rewrite PseudoParagraph.ruleId values "Odd"/"Even" to "Odd" in both CASes.
+  */
+ private void customizeV2V3() {
+ // deferred sort actions, collected first and run afterwards
+ List<Runnable> r = new ArrayList<>();
+// r = sortFSArray("com.ibm.watsonx.nlp_di.cas_term.Term", "outgoingLinks");
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.cas_term.Term", "incomingLinks"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.hutt.Predicate", "arguments"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.cas_term.Expression", "termLinks"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_med.common_types.UMLS.Concept", "innerConcepts"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.common_types.generic_relation.GenericRelation", "args"));
+// r.addAll(sortFSArray("com.ibm.watsonx.nlp_di.hutt.Predicate", "sources"));
+
+ // entries may be null, so each one is checked before it is run
+ for (Runnable a : r) {
+ if (a != null) {
+ a.run();
+ }
+ }
+
+ // from v2:
+ // before sortStringArray
+// fixupTermMentionTypesUnknown();
+// sortStringArray("com.ibm.watsonx.nlp_di.cas_term.Term", "mentionTypes");
+// v2FixupMentionType_mi("com.ibm.watsonx.nlp_di.hutt.UnitOfMeasurement");
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.Person",
+// "R2/2.2.2/Pre/NONDEFROLE/\\Monitor",
+// "R2/2.2.2/Pre/NONDEFROLE/\\monitor");
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.NondefiningRole",
+// "R2/2.2.2/Pre/NONDEFROLE/\\Monitor",
+// "R2/2.2.2/Pre/NONDEFROLE/\\monitor");
+//
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.Symptom",
+// "R2/2.2.2/Pre/NONDEFROLE/\\Sleeping",
+// "R2/2.2.2/Pre/NONDEFROLE/\\sleeping");
+// fixupComponentId("com.ibm.watsonx.nlp_di.hutt.Symptom",
+// "R2/2.2.2/Main/SYMPTOM/\\Sleeping",
+// "R2/2.2.2/Main/SYMPTOM/\\sleeping");
+//
+//
+//
+// sortStringArray("com.ibm.watsonx.nlp_di.hutt.Predicate", "argumentLabels");
+// canonicalizeStringFirstVariant("com.ibm.watsonx.nlp_med.common_types.UMLS.SignOrSymptom", "conceptName", "variants");
+ // NOTE(review): presumably makes "Odd" and "Even" compare as equal for ruleId - confirm in CasCompare
+ cc.addStringCongruenceSet("com.ibm.watsonx.nlp_di.xsg.PseudoParagraph", "ruleId", new String[]{"Odd", "Even"}, -1);
+ // rewrites ruleId values "Odd" or "Even" to "Odd" in both c1 and c2
+ canonicalizeString("com.ibm.watsonx.nlp_di.xsg.PseudoParagraph", "ruleId", new String[]{"Odd", "Even"}, "Odd");
+ }
- }
-
+ /**
+  * Collects the actions produced by cc.sortFSArray for the given feature of every
+  * instance of the given type, first for c1 and then for c2.
+  *
+  * @param typename    fully qualified name of the type whose instances are visited
+  * @param featurename short name of the FSArray-valued feature on that type
+  * @return a mutable list holding the c1 actions followed by the c2 actions;
+  *         callers in this file null-check each entry before running it
+  */
+ private List<Runnable> sortFSArray(String typename, String featurename) {
+   // Collectors.toList() gives no mutability guarantee, so copy before appending
+   List<Runnable> r = new ArrayList<>(sortFSArray(typename, featurename, c1));
+   r.addAll(sortFSArray(typename, featurename, c2));
+   return r;
+ }
+
+ /**
+  * Applies the "_mi" component-id fixup for the named type to both CASes, c1 then c2.
+  *
+  * @param t fully qualified type name
+  */
+ private void v2FixupMentionType_mi(String t) {
+   for (CASImpl cas : new CASImpl[] {c1, c2}) {
+     v2FixupMentionType_mi(t, cas);
+   }
+ }
+
+ /**
+  * For every instance of the named type in any view of the CAS whose componentId is
+  * "R2/2.2.2/Main/UNITOFM/_mi", sets componentId to "R2/2.2.2/Main/UNITOFM/_mile"
+  * and mentionType to "CATEGORY".
+  *
+  * @param t   fully qualified type name; the type is expected to exist in the CAS type system
+  * @param cas the CAS to modify
+  */
+ private void v2FixupMentionType_mi(String t, CASImpl cas) {
+   final Type unitType = cas.getTypeSystem().getType(t);
+   final Feature mentionTypeFeat = unitType.getFeatureByBaseName("mentionType");
+   final Feature componentIdFeat = unitType.getFeatureByBaseName("componentId");
+   cas.select(unitType).allViews().forEach(fs -> {
+     if ("R2/2.2.2/Main/UNITOFM/_mi".equals(fs.getStringValue(componentIdFeat))) {
+       fs.setStringValue(componentIdFeat, "R2/2.2.2/Main/UNITOFM/_mile");
+       fs.setStringValue(mentionTypeFeat, "CATEGORY");
+     }
+   });
+ }
+
+ /**
+  * Replaces componentId value s1 with s2 on instances of the named type, in both
+  * CASes (c1 then c2).
+  *
+  * @param t  fully qualified type name
+  * @param s1 componentId value to look for
+  * @param s2 replacement componentId value
+  */
+ private void fixupComponentId(String t, String s1, String s2) {
+   for (CASImpl cas : new CASImpl[] {c1, c2}) {
+     fixupComponentId(t, s1, s2, cas);
+   }
+ }
+
+ /**
+  * In the given CAS (all views), sets componentId to s2 on every instance of the
+  * named type whose componentId currently equals s1.
+  *
+  * @param t   fully qualified type name; the type is expected to exist in the CAS type system
+  * @param s1  componentId value to look for (must not be null)
+  * @param s2  replacement componentId value
+  * @param cas the CAS to modify
+  */
+ private void fixupComponentId(String t, String s1, String s2, CASImpl cas) {
+   final Type owner = cas.getTypeSystem().getType(t);
+   final Feature componentIdFeat = owner.getFeatureByBaseName("componentId");
+   cas.select(owner).allViews().forEach(fs -> {
+     if (s1.equals(fs.getStringValue(componentIdFeat))) {
+       fs.setStringValue(componentIdFeat, s2);
+     }
+   });
+ }
+
+ /**
+  * Applies the Term.mentionTypes "UNKNOWN" fixup to both CASes, c1 then c2.
+  */
+ private void fixupTermMentionTypesUnknown() {
+   for (CASImpl cas : new CASImpl[] {c1, c2}) {
+     fixupTermMentionTypesUnknown(cas);
+   }
+ }
+
+ /**
+  * For every com.ibm.watsonx.nlp_di.cas_term.Term in any view of the CAS, replaces each
+  * "UNKNOWN" entry of its mentionTypes StringArray with "CATEGORY". Terms whose
+  * mentionTypes is null or holds no "UNKNOWN" entry are left untouched.
+  *
+  * @param cas the CAS to modify
+  */
+ private void fixupTermMentionTypesUnknown(CASImpl cas) {
+   final Type termType = cas.getTypeSystem().getType("com.ibm.watsonx.nlp_di.cas_term.Term");
+   final Feature mentionTypesFeat = termType.getFeatureByBaseName("mentionTypes");
+   cas.select(termType).allViews().forEach(term -> {
+     StringArray mentionTypes = (StringArray) term.getFeatureValue(mentionTypesFeat);
+     if (mentionTypes == null || !mentionTypes.contains("UNKNOWN")) {
+       return;
+     }
+     for (int idx = 0; idx < mentionTypes.size(); idx++) {
+       if ("UNKNOWN".equals(mentionTypes.get(idx))) {
+         mentionTypes.set(idx, "CATEGORY");
+       }
+     }
+   });
+ }
+
+ /**
+  * Passes the FSArray value of the given feature, for every instance of the given type
+  * in any view of the CAS, to cc.sortFSArray and collects the results.
+  *
+  * @param typename    fully qualified type name
+  * @param featurename short feature name; looked up as typename + ":" + featurename
+  * @param cas         the CAS whose instances are visited
+  * @return one entry per visited instance, as returned by cc.sortFSArray
+  */
+ private List<Runnable> sortFSArray(String typename, String featurename, CASImpl cas) {
+   final TypeSystem typeSystem = cas.getTypeSystem();
+   final Type owner = typeSystem.getType(typename);
+   final Feature arrayFeat = typeSystem.getFeatureByFullName(typename + ":" + featurename);
+   return cas.select(owner)
+       .allViews()
+       .map(fs -> (FSArray) fs.getFeatureValue(arrayFeat))
+       .map(array -> cc.sortFSArray(array))
+       .collect(Collectors.toList());
+ }
+
+ /**
+  * Sorts the StringArray feature f of type t in both CASes, c1 then c2.
+  *
+  * @param t fully qualified type name
+  * @param f short name of the StringArray-valued feature
+  */
+ private void sortStringArray(String t, String f) {
+   for (CASImpl cas : new CASImpl[] {c1, c2}) {
+     sortStringArray(t, f, cas);
+   }
+ }
+
+ /**
+  * For every instance of type t in any view of the CAS, sorts the array obtained from
+  * the StringArray value of feature f (via _getTheArray()) into natural String order,
+  * so that element order does not cause spurious differences when comparing.
+  * Null feature values and arrays with fewer than two elements are left alone.
+  *
+  * @param t   fully qualified type name
+  * @param f   short feature name; looked up as t + ":" + f
+  * @param cas the CAS to modify
+  */
+ private void sortStringArray(String t, String f, CASImpl cas) {
+   TypeSystem ts = cas.getTypeSystem();
+   Type type = ts.getType(t);
+   Feature feat = ts.getFeatureByFullName(t + ":" + f);
+   cas.select(type).allViews().forEach(fs -> {
+     StringArray sa = (StringArray) fs.getFeatureValue(feat);
+     // two-element arrays can be out of order too; only sizes 0 and 1 are trivially sorted
+     if (sa != null && sa.size() > 1) {
+       Arrays.sort(sa._getTheArray());
+     }
+   });
+ }
+
+
+
+ /**
+  * Applies the first-variant canonicalization of feature f (using variant array
+  * feature v) on type t to both CASes, c1 then c2.
+  *
+  * @param t fully qualified type name
+  * @param f short name of the String feature to canonicalize
+  * @param v short name of the StringArray feature holding the variants
+  */
+ private void canonicalizeStringFirstVariant(String t, String f, String v) {
+   for (CASImpl cas : new CASImpl[] {c1, c2}) {
+     canonicalizeStringFirstVariant(t, f, v, cas);
+   }
+ }
+
+ /**
+  * For every instance of type t in any view of the CAS: when the variants array (feature v)
+  * is non-null, has more than two elements and contains the current value of feature f,
+  * replaces that value with the first element of the variants array.
+  *
+  * @param t   fully qualified type name
+  * @param f   short name of the String feature to canonicalize
+  * @param v   short name of the StringArray feature holding the variants
+  * @param cas the CAS to modify
+  */
+ void canonicalizeStringFirstVariant(String t, String f, String v, CASImpl cas) {
+   final TypeSystem typeSystem = cas.getTypeSystem();
+   final Type owner = typeSystem.getType(t);
+   final Feature valueFeat = typeSystem.getFeatureByFullName(t + ":" + f);
+   final Feature variantsFeat = typeSystem.getFeatureByFullName(t + ":" + v);
+   cas.select(owner).allViews().forEach(fs -> {
+     StringArray variants = (StringArray) fs.getFeatureValue(variantsFeat);
+     // NOTE(review): the "> 2" threshold skips two-element variant arrays - confirm this is intended
+     if (variants == null || variants.size() <= 2) {
+       return;
+     }
+     String current = fs.getStringValue(valueFeat);
+     if (variants.contains(current)) {
+       fs.setStringValue(valueFeat, variants.get(0));
+     }
+   });
+ }
+
+ /**
+  * Rewrites feature f of type t to the canonical value cv wherever its current value
+  * is one of the filter strings, in both CASes (c1 then c2).
+  *
+  * @param t      fully qualified type name
+  * @param f      short name of the String feature
+  * @param filter values that are to be replaced
+  * @param cv     canonical value to store instead
+  */
+ private void canonicalizeString(String t, String f, String[] filter, String cv) {
+   for (CASImpl cas : new CASImpl[] {c1, c2}) {
+     canonicalizeString(t, f, filter, cv, cas);
+   }
+ }
+
+ /**
+  * For every instance of type t in any view of the CAS, sets feature f to cv when
+  * Misc.contains(filter, currentValue) is true.
+  *
+  * @param t      fully qualified type name
+  * @param f      short feature name; looked up as t + ":" + f
+  * @param filter values that are to be replaced
+  * @param cv     canonical value to store instead
+  * @param cas    the CAS to modify
+  */
+ void canonicalizeString(String t, String f, String[] filter, String cv, CASImpl cas) {
+   final TypeSystem typeSystem = cas.getTypeSystem();
+   final Type owner = typeSystem.getType(t);
+   final Feature valueFeat = typeSystem.getFeatureByFullName(t + ":" + f);
+   cas.select(owner).allViews().forEach(fs -> {
+     String current = fs.getStringValue(valueFeat);
+     if (!Misc.contains(filter, current)) {
+       return;
+     }
+     fs.setStringValue(valueFeat, cv);
+   });
+ }
+
+ /**
+  * Applies removeAllExcept(v, cas) to both CASes, c1 then c2.
+  *
+  * @param v substring of the type names to keep
+  */
+ void removeAllExcept(String v) {
+   for (CASImpl cas : new CASImpl[] {c1, c2}) {
+     removeAllExcept(v, cas);
+   }
+ }
+
+ /**
+  * Walks the annotation index obtained from c.getAnnotationIndex() and removes from the
+  * indexes every item whose type name does not contain v.
+  *
+  * @param v substring of the type names to keep
+  * @param c the CAS whose annotation index is pruned
+  */
+ void removeAllExcept(String v, CASImpl c) {
+   Iterator<AnnotationFS> annotations = c.getAnnotationIndex().iterator();
+   while (annotations.hasNext()) {
+     TOP candidate = (TOP) annotations.next();
+     boolean keep = candidate._getTypeImpl().getName().contains(v);
+     if (!keep) {
+       candidate.removeFromIndexes();
+     }
+   }
+ }
}