You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by st...@apache.org on 2013/07/17 22:34:27 UTC

svn commit: r1504269 - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java

Author: stevenbethard
Date: Wed Jul 17 20:34:26 2013
New Revision: 1504269

URL: http://svn.apache.org/r1504269
Log:
Adds first draft of class to generate "diffs" of CAS feature structures.

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java?rev=1504269&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java Wed Jul 17 20:34:26 2013
@@ -0,0 +1,312 @@
+package org.apache.ctakes.temporal.data.analysis;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.annotation.Nullable;
+
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.impl.XmiCasDeserializer;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.uimafit.factory.JCasFactory;
+import org.uimafit.util.JCasUtil;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.base.Objects;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.TreeMultimap;
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+
+public class CompareFeatureStructures {
+  /**
+   * Command-line options for this tool, parsed by JewelCli in {@code main}.
+   */
+  interface Options {
+    /** Returns the first directory to compare ({@code --dir1}). */
+    @Option(longName = "dir1")
+    File getDirectory1();
+
+    /** Returns the second directory to compare ({@code --dir2}). */
+    @Option(longName = "dir2")
+    File getDirectory2();
+
+    /**
+     * Returns the fully qualified names of the classes whose indexed feature structures are
+     * compared ({@code --roots}).
+     */
+    @Option(longName = "roots", defaultValue = {
+        "org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation",
+        "org.apache.ctakes.typesystem.type.relation.Relation" })
+    List<String> getAnnotationClassNames();
+  }
+
+  public static void main(String[] args) throws Exception {
+    Options options = CliFactory.parseArguments(Options.class, args);
+    List<Class<?>> annotationClasses = Lists.newArrayList();
+    for (String annotationClassName : options.getAnnotationClassNames()) {
+      annotationClasses.add(Class.forName(annotationClassName));
+    }
+    File dir1 = options.getDirectory1();
+    File dir2 = options.getDirectory2();
+    if (!Arrays.equals(dir1.list(), dir2.list())) {
+      System.err.printf("%s and %s contain different files", dir1, dir2);
+    } else {
+      for (String fileName : dir1.list()) {
+        System.err.printf("== Checking %s ===\n", fileName);
+        JCas jCas1 = readXMI(new File(dir1, fileName));
+        JCas jCas2 = readXMI(new File(dir2, fileName));
+        List<String> viewNames1 = getViewNames(jCas1);
+        List<String> viewNames2 = getViewNames(jCas2);
+        if (areEqual("view-names", viewNames1, viewNames2)) {
+          for (String viewName : viewNames1) {
+            JCas view1 = jCas1.getView(viewName);
+            JCas view2 = jCas2.getView(viewName);
+            for (Class<?> annotationClass : annotationClasses) {
+              Multimap<Type, FeatureStructure> fsMap1 = toSortedMultimap(view1, annotationClass);
+              Multimap<Type, FeatureStructure> fsMap2 = toSortedMultimap(view2, annotationClass);
+              if (areEqual("annotation-counts", fsMap1.keys(), fsMap2.keys())) {
+                for (Type type : fsMap1.keySet()) {
+                  Iterator<FeatureStructure> fsIter1 = fsMap1.get(type).iterator();
+                  Iterator<FeatureStructure> fsIter2 = fsMap2.get(type).iterator();
+                  while (fsIter1.hasNext() && fsIter2.hasNext()) {
+                    FeatureStructure fs1 = fsIter1.next();
+                    FeatureStructure fs2 = fsIter2.next();
+                    FSDiff diff = new FSDiff(fs1, fs2);
+                    if (diff.hasDifferences()) {
+                      System.err.println(diff);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  }
+
+  /**
+   * Creates a new JCas and fills it by deserializing the given XMI file.
+   *
+   * @param xmiFile the XMI file to read
+   * @return the JCas holding the deserialized contents
+   * @throws Exception if the JCas cannot be created or the file cannot be opened or deserialized
+   */
+  private static JCas readXMI(File xmiFile) throws Exception {
+    JCas loaded = JCasFactory.createJCas();
+    FileInputStream xmiStream = new FileInputStream(xmiFile);
+    try {
+      XmiCasDeserializer.deserialize(xmiStream, loaded.getCas());
+      return loaded;
+    } finally {
+      // release the file handle whether or not deserialization succeeded
+      xmiStream.close();
+    }
+  }
+
+  /**
+   * Collects the names of all views of a JCas, in the order of its view iterator.
+   *
+   * @param jCas the JCas whose views are listed
+   * @return a new list of view names
+   * @throws CASException if the views cannot be iterated
+   */
+  private static List<String> getViewNames(JCas jCas) throws CASException {
+    List<String> viewNames = Lists.newArrayList();
+    for (Iterator<JCas> viewIter = jCas.getViewIterator(); viewIter.hasNext();) {
+      JCas view = viewIter.next();
+      viewNames.add(view.getViewName());
+    }
+    return viewNames;
+  }
+
+  /**
+   * Null-safe equality check that reports a mismatch on standard error in diff style.
+   *
+   * @param name label printed in the report to identify what was compared
+   * @param o1 the first value (printed on the {@code -} line)
+   * @param o2 the second value (printed on the {@code +} line)
+   * @return {@code true} if both values are equal, {@code false} (after reporting) otherwise
+   */
+  private static boolean areEqual(String name, Object o1, Object o2) {
+    if (Objects.equal(o1, o2)) {
+      return true;
+    }
+    System.err.printf("Difference in %s:\n-%s\n+%s\n", name, o1, o2);
+    return false;
+  }
+
+  /**
+   * Groups all indexed feature structures of the given class (as returned by
+   * {@code getAllIndexedFS} for its type) by their concrete type.
+   * <p>
+   * Keys are sorted by type name. Values are sorted by offsets; feature structures with identical
+   * offsets are further ordered by their string form so that the two sides of a comparison line up
+   * the same way.
+   *
+   * @param jCas the view to read feature structures from
+   * @param annotationClass the JCas class whose type selects the feature structures
+   * @return a sorted multimap from concrete type to the feature structures of that type
+   */
+  private static Multimap<Type, FeatureStructure> toSortedMultimap(
+      JCas jCas,
+      Class<?> annotationClass) {
+    Type type = JCasUtil.getType(jCas, annotationClass);
+    FSIterator<FeatureStructure> fsIterator = jCas.getFSIndexRepository().getAllIndexedFS(type);
+    // TreeMultimap keeps its values in a sorted set and discards any value for which the
+    // comparator returns 0 against an existing one. Offsets alone would therefore drop distinct
+    // feature structures that cover the same span, so break ties by string form and, as a last
+    // resort, by an arbitrary but consistent order that is 0 only for the same object.
+    Ordering<FeatureStructure> byOffsetsThenText = BY_OFFSETS.compound(Ordering.usingToString());
+    Ordering<FeatureStructure> valueOrdering = byOffsetsThenText.compound(Ordering.arbitrary());
+    Multimap<Type, FeatureStructure> result = TreeMultimap.create(BY_NAME, valueOrdering);
+    while (fsIterator.hasNext()) {
+      FeatureStructure fs = fsIterator.next();
+      result.put(fs.getType(), fs);
+    }
+    return result;
+  }
+  
+  private static final Ordering<Type> BY_NAME = Ordering.natural().onResultOf(
+      new Function<Type, String>() {
+        @Override
+        public String apply(@Nullable Type input) {
+          return input.getName();
+        }
+      });
+
+  /**
+   * Orders feature structures by a list of character offsets, compared lexicographically.
+   * <p>
+   * An annotation contributes its begin and end. An FSArray contributes the offsets of each of its
+   * elements, a non-empty FS list those of its head followed by its tail, and any other feature
+   * structure those of each of its non-primitive feature values. A null input contributes nothing.
+   * <p>
+   * NOTE(review): the recursion keeps no record of visited feature structures; presumably a
+   * structure that refers back to itself through non-annotation features would recurse without
+   * end -- confirm.
+   */
+  private static final Ordering<FeatureStructure> BY_OFFSETS =
+      Ordering.natural().<Integer> lexicographical().onResultOf(
+          new Function<FeatureStructure, Iterable<Integer>>() {
+            @Override
+            public Iterable<Integer> apply(@Nullable FeatureStructure input) {
+              List<Integer> offsets = Lists.newArrayList();
+              if (input == null) {
+                return offsets;
+              }
+              if (input instanceof Annotation) {
+                // base case: an annotation is located by its own span
+                Annotation annotation = (Annotation) input;
+                offsets.add(annotation.getBegin());
+                offsets.add(annotation.getEnd());
+                return offsets;
+              }
+              if (input instanceof FSArray) {
+                FSArray elements = (FSArray) input;
+                for (int index = 0; index < elements.size(); index++) {
+                  Iterables.addAll(offsets, this.apply(elements.get(index)));
+                }
+                return offsets;
+              }
+              if (input instanceof NonEmptyFSList) {
+                NonEmptyFSList listNode = (NonEmptyFSList) input;
+                Iterables.addAll(offsets, this.apply(listNode.getHead()));
+                Iterables.addAll(offsets, this.apply(listNode.getTail()));
+                return offsets;
+              }
+              // any other feature structure: gather offsets through its non-primitive features
+              for (Feature feature : input.getType().getFeatures()) {
+                if (feature.getRange().isPrimitive()) {
+                  continue;
+                }
+                Iterables.addAll(offsets, this.apply(input.getFeatureValue(feature)));
+              }
+              return offsets;
+            }
+          });
+
+  /**
+   * Finds the differences between two feature structures, following non-primitive features
+   * recursively, and renders them as a diff-style listing of the first structure.
+   */
+  public static class FSDiff {
+    /** Matches the leading whitespace of a line prefix. */
+    private static final Pattern INDENT = Pattern.compile("^\\s*");
+
+    private final List<FSDifference> differences;
+    private final FeatureStructure root1, root2;
+
+    /**
+     * Compares the two feature structures immediately and records every difference found.
+     *
+     * @param root1 the first feature structure; its features drive the comparison and its string
+     *          form is the basis of {@link #toString()}
+     * @param root2 the second feature structure
+     */
+    public FSDiff(FeatureStructure root1, FeatureStructure root2) {
+      this.root1 = root1;
+      this.root2 = root2;
+      this.differences = Lists.newArrayList();
+      this.findDifferences(
+          this.root1,
+          this.root2,
+          Lists.<Feature> newArrayList(),
+          Lists.<FeatureStructure> newArrayList());
+    }
+
+    /**
+     * @return {@code true} if at least one difference was recorded
+     */
+    public boolean hasDifferences() {
+      return !this.differences.isEmpty();
+    }
+
+    /**
+     * Renders the differences in diff style: the string form of the first root, where each line
+     * containing a differing value is replaced by a {@code -} line with the first value and a
+     * {@code +} line with the second, and every other line is prefixed with a space.
+     *
+     * @return the diff text, or the empty string if there are no differences
+     */
+    @Override
+    public String toString() {
+      if (!this.hasDifferences()) {
+        return "";
+      }
+      List<String> paths = Lists.newArrayList();
+      for (FSDifference difference : this.differences) {
+        List<String> featureNames = Lists.newArrayList();
+        for (Feature feature : difference.getPath()) {
+          featureNames.add(feature.getShortName());
+        }
+        paths.add(Joiner.on('/').join(featureNames));
+      }
+      String diff = this.root1.toString();
+      for (FSDifference difference : this.differences) {
+        // either value may be null (e.g. a feature set on one side only); render it as "null"
+        // instead of failing with a NullPointerException
+        String value1 = String.valueOf(difference.getValue1()).trim();
+        String value2 = String.valueOf(difference.getValue2()).trim();
+        Pattern pattern = Pattern.compile(
+            "^(.*?)(" + toWhitespaceTolerantRegex(value1) + ")",
+            Pattern.MULTILINE);
+        Matcher matcher = pattern.matcher(diff);
+        StringBuffer buffer = new StringBuffer();
+        while (matcher.find()) {
+          String prefix = matcher.group(1);
+          String replacement;
+          // don't re-replace things that have already been taken care of
+          if (prefix.startsWith("-") || prefix.startsWith("+")) {
+            replacement = matcher.group();
+          }
+          // replace the current text with diff-style +/- text
+          else {
+            Matcher indentMatcher = INDENT.matcher(prefix);
+            indentMatcher.find();
+            String indent = indentMatcher.group();
+            replacement =
+                String.format(
+                    "%s%s\n%s%s",
+                    "-" + prefix,
+                    value1.replace("\n", "\n-" + indent),
+                    "+" + prefix,
+                    value2.replace("\n", "\n+" + indent));
+          }
+          // the replacement is literal text: keep '$' and '\' from being read as group references
+          matcher.appendReplacement(buffer, Matcher.quoteReplacement(replacement));
+        }
+        matcher.appendTail(buffer);
+        diff = buffer.toString();
+      }
+      // mark every untouched line as context
+      diff = diff.replaceAll("(?m)^(?![+-])", " ");
+      return String.format("Difference in %s:\n%s", paths, diff);
+    }
+
+    /**
+     * Builds a regular expression that matches the given text literally, except that each run of
+     * whitespace in the text matches any run of whitespace. This lets a value be located inside
+     * the root's string form even when it is indented or wrapped differently there.
+     */
+    private static String toWhitespaceTolerantRegex(String text) {
+      List<String> quotedPieces = Lists.newArrayList();
+      for (String piece : text.split("\\s+")) {
+        // quote each piece so characters such as '(', '[' or '*' are matched literally
+        quotedPieces.add(Pattern.quote(piece));
+      }
+      return Joiner.on("\\s+").join(quotedPieces);
+    }
+
+    /**
+     * Recursively compares two feature structures feature by feature, using the features of the
+     * first structure's type, and records each difference together with its feature path.
+     *
+     * @param fs1 the feature structure from the first side
+     * @param fs2 the feature structure from the second side
+     * @param featurePath the features followed from the roots to reach {@code fs1} and {@code fs2}
+     * @param seen feature structures already visited; a pair is skipped if either was visited
+     */
+    private void findDifferences(
+        FeatureStructure fs1,
+        FeatureStructure fs2,
+        List<Feature> featurePath,
+        List<FeatureStructure> seen) {
+      if (seen.contains(fs1) || seen.contains(fs2)) {
+        return;
+      }
+      seen.add(fs1);
+      seen.add(fs2);
+      for (Feature feature : fs1.getType().getFeatures()) {
+        // the sofa feature is deliberately left out of the comparison
+        if (feature.getName().equals("uima.cas.AnnotationBase:sofa")) {
+          continue;
+        }
+        List<Feature> newPath = Lists.newArrayList(featurePath);
+        newPath.add(feature);
+        if (feature.getRange().isPrimitive()) {
+          String value1 = fs1.getFeatureValueAsString(feature);
+          String value2 = fs2.getFeatureValueAsString(feature);
+          if (!Objects.equal(value1, value2)) {
+            this.differences.add(new FSDifference(newPath, value1, value2));
+          }
+        } else {
+          // NOTE(review): array-valued features are not primitive, so they take this branch;
+          // presumably their elements are never compared because the recursion only looks at a
+          // type's features -- confirm.
+          FeatureStructure value1 = fs1.getFeatureValue(feature);
+          FeatureStructure value2 = fs2.getFeatureValue(feature);
+          if (value1 == null
+              || value2 == null
+              || !value1.getType().getName().equals(value2.getType().getName())) {
+            // not comparable feature by feature: record the pair unless both are equal
+            if (!Objects.equal(value1, value2)) {
+              this.differences.add(new FSDifference(newPath, value1, value2));
+            }
+          } else {
+            this.findDifferences(value1, value2, newPath, seen);
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * A single difference between two feature structures: the feature path at which they differ and
+   * the value found on each side.
+   */
+  public static class FSDifference {
+
+    private final List<Feature> path;
+    private final Object value1;
+    private final Object value2;
+
+    /**
+     * @param path the features followed from the root to the differing value
+     * @param value1 the value on the first side; may be null
+     * @param value2 the value on the second side; may be null
+     */
+    public FSDifference(List<Feature> path, Object value1, Object value2) {
+      this.path = path;
+      this.value1 = value1;
+      this.value2 = value2;
+    }
+
+    /** @return the features followed from the root to the differing value */
+    public List<Feature> getPath() {
+      return this.path;
+    }
+
+    /** @return the value on the first side */
+    public Object getValue1() {
+      return this.value1;
+    }
+
+    /** @return the value on the second side */
+    public Object getValue2() {
+      return this.value2;
+    }
+  }
+}



Re: svn commit: r1504269 - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java

Posted by Steven Bethard <st...@Colorado.EDU>.
On 17 Jul 2013, at 15:52, "Chen, Pei" <Pe...@childrens.harvard.edu> wrote:
> Having a general Util to compare CAS feature structures would be so useful on so many levels.
> Could we create a Jira for this so that it will help facilitate dev awareness of these utilities when we generate release notes?
> I've been looking for something like this from the UIMA tools for a while…

Done: https://issues.apache.org/jira/browse/CTAKES-217

As I note in the issue, the current CompareFeatureStructures is very much a draft, but if anyone is interested in making this more generally useful, I'd welcome the help.

Steve

>
>> -----Original Message-----
>> From: stevenbethard@apache.org [mailto:stevenbethard@apache.org]
>> Sent: Wednesday, July 17, 2013 4:35 PM
>> To: commits@ctakes.apache.org
>> Subject: svn commit: r1504269 - /ctakes/trunk/ctakes-
>> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
>> eFeatureStructures.java
>>
>> Author: stevenbethard
>> Date: Wed Jul 17 20:34:26 2013
>> New Revision: 1504269
>>
>> URL: http://svn.apache.org/r1504269
>> Log:
>> Adds first draft of class to generate "diffs" of CAS feature structures.
>>
>> Added:
>>    ctakes/trunk/ctakes-
>> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
>> eFeatureStructures.java
>>
>> Added: ctakes/trunk/ctakes-
>> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
>> eFeatureStructures.java
>> URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-
>> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
>> eFeatureStructures.java?rev=1504269&view=auto
>> ==========================================================
>> ====================
>> --- ctakes/trunk/ctakes-
>> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
>> eFeatureStructures.java (added)
>> +++ ctakes/trunk/ctakes-
>> temporal/src/main/java/org/apache/ctakes/tempora
>> +++ l/data/analysis/CompareFeatureStructures.java Wed Jul 17 20:34:26
>> +++ 2013
>> @@ -0,0 +1,312 @@
>> +package org.apache.ctakes.temporal.data.analysis;
>> +
>> +import java.io.File;
>> +import java.io.FileInputStream;
>> +import java.util.Arrays;
>> +import java.util.Iterator;
>> +import java.util.List;
>> +import java.util.regex.Matcher;
>> +import java.util.regex.Pattern;
>> +
>> +import javax.annotation.Nullable;
>> +
>> +import org.apache.uima.cas.CASException; import
>> +org.apache.uima.cas.FSIterator; import org.apache.uima.cas.Feature;
>> +import org.apache.uima.cas.FeatureStructure;
>> +import org.apache.uima.cas.Type;
>> +import org.apache.uima.cas.impl.XmiCasDeserializer;
>> +import org.apache.uima.jcas.JCas;
>> +import org.apache.uima.jcas.cas.FSArray; import
>> +org.apache.uima.jcas.cas.NonEmptyFSList;
>> +import org.apache.uima.jcas.tcas.Annotation;
>> +import org.uimafit.factory.JCasFactory; import
>> +org.uimafit.util.JCasUtil;
>> +
>> +import com.google.common.base.Function; import
>> +com.google.common.base.Joiner; import
>> com.google.common.base.Objects;
>> +import com.google.common.collect.Iterables;
>> +import com.google.common.collect.Lists; import
>> +com.google.common.collect.Multimap;
>> +import com.google.common.collect.Ordering;
>> +import com.google.common.collect.TreeMultimap;
>> +import com.lexicalscope.jewel.cli.CliFactory;
>> +import com.lexicalscope.jewel.cli.Option;
>> +
>> +public class CompareFeatureStructures {
>> +  static interface Options {
>> +    @Option(longName = "dir1")
>> +    public File getDirectory1();
>> +
>> +    @Option(longName = "dir2")
>> +    public File getDirectory2();
>> +
>> +    @Option(longName = "roots", defaultValue = {
>> +        "org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation",
>> +        "org.apache.ctakes.typesystem.type.relation.Relation" })
>> +    public List<String> getAnnotationClassNames();  }
>> +
>> +  public static void main(String[] args) throws Exception {
>> +    Options options = CliFactory.parseArguments(Options.class, args);
>> +    List<Class<?>> annotationClasses = Lists.newArrayList();
>> +    for (String annotationClassName : options.getAnnotationClassNames()) {
>> +      annotationClasses.add(Class.forName(annotationClassName));
>> +    }
>> +    File dir1 = options.getDirectory1();
>> +    File dir2 = options.getDirectory2();
>> +    if (!Arrays.equals(dir1.list(), dir2.list())) {
>> +      System.err.printf("%s and %s contain different files", dir1, dir2);
>> +    } else {
>> +      for (String fileName : dir1.list()) {
>> +        System.err.printf("== Checking %s ===\n", fileName);
>> +        JCas jCas1 = readXMI(new File(dir1, fileName));
>> +        JCas jCas2 = readXMI(new File(dir2, fileName));
>> +        List<String> viewNames1 = getViewNames(jCas1);
>> +        List<String> viewNames2 = getViewNames(jCas2);
>> +        if (areEqual("view-names", viewNames1, viewNames2)) {
>> +          for (String viewName : viewNames1) {
>> +            JCas view1 = jCas1.getView(viewName);
>> +            JCas view2 = jCas2.getView(viewName);
>> +            for (Class<?> annotationClass : annotationClasses) {
>> +              Multimap<Type, FeatureStructure> fsMap1 =
>> toSortedMultimap(view1, annotationClass);
>> +              Multimap<Type, FeatureStructure> fsMap2 =
>> toSortedMultimap(view2, annotationClass);
>> +              if (areEqual("annotation-counts", fsMap1.keys(), fsMap2.keys())) {
>> +                for (Type type : fsMap1.keySet()) {
>> +                  Iterator<FeatureStructure> fsIter1 = fsMap1.get(type).iterator();
>> +                  Iterator<FeatureStructure> fsIter2 = fsMap2.get(type).iterator();
>> +                  while (fsIter1.hasNext() && fsIter2.hasNext()) {
>> +                    FeatureStructure fs1 = fsIter1.next();
>> +                    FeatureStructure fs2 = fsIter2.next();
>> +                    FSDiff diff = new FSDiff(fs1, fs2);
>> +                    if (diff.hasDifferences()) {
>> +                      System.err.println(diff);
>> +                    }
>> +                  }
>> +                }
>> +              }
>> +            }
>> +          }
>> +        }
>> +      }
>> +    }
>> +
>> +  }
>> +
>> +  private static JCas readXMI(File xmiFile) throws Exception {
>> +    JCas jCas = JCasFactory.createJCas();
>> +    FileInputStream inputStream = new FileInputStream(xmiFile);
>> +    try {
>> +      XmiCasDeserializer.deserialize(inputStream, jCas.getCas());
>> +    } finally {
>> +      inputStream.close();
>> +    }
>> +    return jCas;
>> +  }
>> +
>> +  private static List<String> getViewNames(JCas jCas) throws CASException {
>> +    List<String> names = Lists.newArrayList();
>> +    Iterator<JCas> views = jCas.getViewIterator();
>> +    while (views.hasNext()) {
>> +      names.add(views.next().getViewName());
>> +    }
>> +    return names;
>> +  }
>> +
>> +  private static boolean areEqual(String name, Object o1, Object o2) {
>> +    boolean areEqual = Objects.equal(o1, o2);
>> +    if (!areEqual) {
>> +      System.err.printf("Difference in %s:\n-%s\n+%s\n", name, o1, o2);
>> +    }
>> +    return areEqual;
>> +  }
>> +
>> +  private static Multimap<Type, FeatureStructure> toSortedMultimap(
>> +      JCas jCas,
>> +      Class<?> annotationClass) {
>> +    Type type = JCasUtil.getType(jCas, annotationClass);
>> +    FSIterator<FeatureStructure> fsIterator =
>> jCas.getFSIndexRepository().getAllIndexedFS(type);
>> +    Multimap<Type, FeatureStructure> result =
>> TreeMultimap.create(BY_NAME, BY_OFFSETS);
>> +    while (fsIterator.hasNext()) {
>> +      FeatureStructure fs = fsIterator.next();
>> +      result.put(fs.getType(), fs);
>> +    }
>> +    return result;
>> +  }
>> +
>> +  private static final Ordering<Type> BY_NAME =
>> Ordering.natural().onResultOf(
>> +      new Function<Type, String>() {
>> +        @Override
>> +        public String apply(@Nullable Type input) {
>> +          return input.getName();
>> +        }
>> +      });
>> +
>> +  private static final Ordering<FeatureStructure> BY_OFFSETS =
>> +      Ordering.natural().<Integer> lexicographical().onResultOf(
>> +          new Function<FeatureStructure, Iterable<Integer>>() {
>> +            @Override
>> +            public Iterable<Integer> apply(@Nullable FeatureStructure input) {
>> +              List<Integer> offsets = Lists.newArrayList();
>> +              if (input != null) {
>> +                if (input instanceof Annotation) {
>> +                  Annotation annotation = (Annotation) input;
>> +                  offsets.add(annotation.getBegin());
>> +                  offsets.add(annotation.getEnd());
>> +                } else if (input instanceof FSArray) {
>> +                  FSArray fsArray = (FSArray) input;
>> +                  for (int i = 0; i < fsArray.size(); ++i) {
>> +                    Iterables.addAll(offsets, this.apply(fsArray.get(i)));
>> +                  }
>> +                } else if (input instanceof NonEmptyFSList) {
>> +                  NonEmptyFSList fsList = (NonEmptyFSList) input;
>> +                  Iterables.addAll(offsets, this.apply(fsList.getHead()));
>> +                  Iterables.addAll(offsets, this.apply(fsList.getTail()));
>> +                } else {
>> +                  for (Feature feature : input.getType().getFeatures()) {
>> +                    if (!feature.getRange().isPrimitive()) {
>> +                      Iterables.addAll(offsets,
>> this.apply(input.getFeatureValue(feature)));
>> +                    }
>> +                  }
>> +                }
>> +              }
>> +              return offsets;
>> +            }
>> +          });
>> +
>> +  public static class FSDiff {
>> +    private List<FSDifference> differences;
>> +    private FeatureStructure root1, root2;
>> +
>> +    public FSDiff(FeatureStructure root1, FeatureStructure root2) {
>> +      this.root1 = root1;
>> +      this.root2 = root2;
>> +      this.differences = Lists.newArrayList();
>> +      this.findDifferences(
>> +          this.root1,
>> +          this.root2,
>> +          Lists.<Feature> newArrayList(),
>> +          Lists.<FeatureStructure> newArrayList());
>> +    }
>> +
>> +    public boolean hasDifferences() {
>> +      return !this.differences.isEmpty();
>> +    }
>> +
>> +    @Override
>> +    public String toString() {
>> +      String diff;
>> +      if (!this.hasDifferences()) {
>> +        diff = "";
>> +      } else {
>> +        List<String> paths = Lists.newArrayList();
>> +        for (FSDifference difference : this.differences) {
>> +          List<String> featureNames = Lists.newArrayList();
>> +          for (Feature feature : difference.getPath()) {
>> +            featureNames.add(feature.getShortName());
>> +          }
>> +          paths.add(Joiner.on('/').join(featureNames));
>> +        }
>> +        diff = this.root1.toString();
>> +        for (FSDifference difference : this.differences) {
>> +          String value1 = difference.getValue1().toString().trim();
>> +          String value2 = difference.getValue2().toString().trim();
>> +          String value1space = value1.replaceAll("\\s+", "\\\\s+");
>> +          Pattern pattern =
>> +              Pattern.compile(String.format("^(.*?)(%s)", value1space),
>> Pattern.MULTILINE);
>> +          Matcher matcher = pattern.matcher(diff);
>> +          StringBuffer buffer = new StringBuffer();
>> +          while (matcher.find()) {
>> +            String prefix = matcher.group(1);
>> +            String replacement;
>> +            // don't re-replace things that have already been taken care of
>> +            if (prefix.startsWith("-") || prefix.startsWith("+")) {
>> +              replacement = matcher.group();
>> +            }
>> +            // replace the current text with diff-style +/- text
>> +            else {
>> +              Matcher indentMatcher = Pattern.compile("^\\s*").matcher(prefix);
>> +              indentMatcher.find();
>> +              String indent = indentMatcher.group();
>> +              replacement =
>> +                  String.format(
>> +                      "%s%s\n%s%s",
>> +                      "-" + prefix,
>> +                      value1.replaceAll("\n", "\n-" + indent),
>> +                      "+" + prefix,
>> +                      value2.replaceAll("\n", "\n+" + indent));
>> +            }
>> +            matcher.appendReplacement(buffer, replacement);
>> +          }
>> +          matcher.appendTail(buffer);
>> +          diff = buffer.toString();
>> +        }
>> +        diff = diff.replaceAll("(?m)^(?![+-])", " ");
>> +        diff = String.format("Difference in %s:\n%s", paths, diff);
>> +      }
>> +      return diff;
>> +    }
>> +
>> +    private void findDifferences(
>> +        FeatureStructure fs1,
>> +        FeatureStructure fs2,
>> +        List<Feature> featurePath,
>> +        List<FeatureStructure> seen) {
>> +      if (!seen.contains(fs1) && !seen.contains(fs2)) {
>> +        seen.add(fs1);
>> +        seen.add(fs2);
>> +        for (Feature feature : fs1.getType().getFeatures()) {
>> +          if (feature.getName().equals("uima.cas.AnnotationBase:sofa")) {
>> +            continue;
>> +          }
>> +          List<Feature> newPath = Lists.newArrayList(featurePath);
>> +          newPath.add(feature);
>> +          if (feature.getRange().isPrimitive()) {
>> +            String value1 = fs1.getFeatureValueAsString(feature);
>> +            String value2 = fs2.getFeatureValueAsString(feature);
>> +            if (!Objects.equal(value1, value2)) {
>> +              this.differences.add(new FSDifference(newPath, value1, value2));
>> +            }
>> +          } else {
>> +            FeatureStructure value1 = fs1.getFeatureValue(feature);
>> +            FeatureStructure value2 = fs2.getFeatureValue(feature);
>> +            if (value1 == null
>> +                || value2 == null
>> +                ||
>> !value1.getType().getName().equals(value2.getType().getName())) {
>> +              if (!Objects.equal(value1, value2)) {
>> +                this.differences.add(new FSDifference(newPath, value1, value2));
>> +              }
>> +            } else {
>> +              this.findDifferences(value1, value2, newPath, seen);
>> +            }
>> +          }
>> +        }
>> +      }
>> +    }
>> +  }
>> +
>> +  public static class FSDifference {
>> +
>> +    private List<Feature> path;
>> +    private Object value1, value2;
>> +
>> +    public FSDifference(List<Feature> path, Object value1, Object value2) {
>> +      this.path = path;
>> +      this.value1 = value1;
>> +      this.value2 = value2;
>> +    }
>> +
>> +    public List<Feature> getPath() {
>> +      return path;
>> +    }
>> +
>> +    public Object getValue1() {
>> +      return value1;
>> +    }
>> +
>> +    public Object getValue2() {
>> +      return value2;
>> +    }
>> +  }
>> +}
>>
>


RE: svn commit: r1504269 - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/CompareFeatureStructures.java

Posted by "Chen, Pei" <Pe...@childrens.harvard.edu>.
Having a general Util to compare CAS feature structures would be so useful on so many levels.
Could we create a Jira for this so that it will help facilitate dev awareness of these utilities when we generate release notes?
I've been looking for something like this from the UIMA tools for a while...

> -----Original Message-----
> From: stevenbethard@apache.org [mailto:stevenbethard@apache.org]
> Sent: Wednesday, July 17, 2013 4:35 PM
> To: commits@ctakes.apache.org
> Subject: svn commit: r1504269 - /ctakes/trunk/ctakes-
> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
> eFeatureStructures.java
> 
> Author: stevenbethard
> Date: Wed Jul 17 20:34:26 2013
> New Revision: 1504269
> 
> URL: http://svn.apache.org/r1504269
> Log:
> Adds first draft of class to generate "diffs" of CAS feature structures.
> 
> Added:
>     ctakes/trunk/ctakes-
> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
> eFeatureStructures.java
> 
> Added: ctakes/trunk/ctakes-
> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
> eFeatureStructures.java
> URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-
> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
> eFeatureStructures.java?rev=1504269&view=auto
> ==========================================================
> ====================
> --- ctakes/trunk/ctakes-
> temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/Compar
> eFeatureStructures.java (added)
> +++ ctakes/trunk/ctakes-
> temporal/src/main/java/org/apache/ctakes/tempora
> +++ l/data/analysis/CompareFeatureStructures.java Wed Jul 17 20:34:26
> +++ 2013
> @@ -0,0 +1,312 @@
> +package org.apache.ctakes.temporal.data.analysis;
> +
> +import java.io.File;
> +import java.io.FileInputStream;
> +import java.util.Arrays;
> +import java.util.Iterator;
> +import java.util.List;
> +import java.util.regex.Matcher;
> +import java.util.regex.Pattern;
> +
> +import javax.annotation.Nullable;
> +
> +import org.apache.uima.cas.CASException; import
> +org.apache.uima.cas.FSIterator; import org.apache.uima.cas.Feature;
> +import org.apache.uima.cas.FeatureStructure;
> +import org.apache.uima.cas.Type;
> +import org.apache.uima.cas.impl.XmiCasDeserializer;
> +import org.apache.uima.jcas.JCas;
> +import org.apache.uima.jcas.cas.FSArray; import
> +org.apache.uima.jcas.cas.NonEmptyFSList;
> +import org.apache.uima.jcas.tcas.Annotation;
> +import org.uimafit.factory.JCasFactory; import
> +org.uimafit.util.JCasUtil;
> +
> +import com.google.common.base.Function; import
> +com.google.common.base.Joiner; import
> com.google.common.base.Objects;
> +import com.google.common.collect.Iterables;
> +import com.google.common.collect.Lists; import
> +com.google.common.collect.Multimap;
> +import com.google.common.collect.Ordering;
> +import com.google.common.collect.TreeMultimap;
> +import com.lexicalscope.jewel.cli.CliFactory;
> +import com.lexicalscope.jewel.cli.Option;
> +
> +public class CompareFeatureStructures {
> +  static interface Options {
> +    @Option(longName = "dir1")
> +    public File getDirectory1();
> +
> +    @Option(longName = "dir2")
> +    public File getDirectory2();
> +
> +    @Option(longName = "roots", defaultValue = {
> +        "org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation",
> +        "org.apache.ctakes.typesystem.type.relation.Relation" })
> +    public List<String> getAnnotationClassNames();  }
> +
> +  public static void main(String[] args) throws Exception {
> +    Options options = CliFactory.parseArguments(Options.class, args);
> +    List<Class<?>> annotationClasses = Lists.newArrayList();
> +    for (String annotationClassName : options.getAnnotationClassNames()) {
> +      annotationClasses.add(Class.forName(annotationClassName));
> +    }
> +    File dir1 = options.getDirectory1();
> +    File dir2 = options.getDirectory2();
> +    if (!Arrays.equals(dir1.list(), dir2.list())) {
> +      System.err.printf("%s and %s contain different files", dir1, dir2);
> +    } else {
> +      for (String fileName : dir1.list()) {
> +        System.err.printf("== Checking %s ===\n", fileName);
> +        JCas jCas1 = readXMI(new File(dir1, fileName));
> +        JCas jCas2 = readXMI(new File(dir2, fileName));
> +        List<String> viewNames1 = getViewNames(jCas1);
> +        List<String> viewNames2 = getViewNames(jCas2);
> +        if (areEqual("view-names", viewNames1, viewNames2)) {
> +          for (String viewName : viewNames1) {
> +            JCas view1 = jCas1.getView(viewName);
> +            JCas view2 = jCas2.getView(viewName);
> +            for (Class<?> annotationClass : annotationClasses) {
> +              Multimap<Type, FeatureStructure> fsMap1 =
> toSortedMultimap(view1, annotationClass);
> +              Multimap<Type, FeatureStructure> fsMap2 =
> toSortedMultimap(view2, annotationClass);
> +              if (areEqual("annotation-counts", fsMap1.keys(), fsMap2.keys())) {
> +                for (Type type : fsMap1.keySet()) {
> +                  Iterator<FeatureStructure> fsIter1 = fsMap1.get(type).iterator();
> +                  Iterator<FeatureStructure> fsIter2 = fsMap2.get(type).iterator();
> +                  while (fsIter1.hasNext() && fsIter2.hasNext()) {
> +                    FeatureStructure fs1 = fsIter1.next();
> +                    FeatureStructure fs2 = fsIter2.next();
> +                    FSDiff diff = new FSDiff(fs1, fs2);
> +                    if (diff.hasDifferences()) {
> +                      System.err.println(diff);
> +                    }
> +                  }
> +                }
> +              }
> +            }
> +          }
> +        }
> +      }
> +    }
> +
> +  }
> +
> +  private static JCas readXMI(File xmiFile) throws Exception {
> +    JCas jCas = JCasFactory.createJCas();
> +    FileInputStream inputStream = new FileInputStream(xmiFile);
> +    try {
> +      XmiCasDeserializer.deserialize(inputStream, jCas.getCas());
> +    } finally {
> +      inputStream.close();
> +    }
> +    return jCas;
> +  }
> +
> +  /**
> +   * Lists the names of all views of a JCas, in the order its view iterator
> +   * returns them.
> +   *
> +   * @param jCas the JCas whose views are listed
> +   * @return the view names
> +   * @throws CASException if the views cannot be iterated
> +   */
> +  private static List<String> getViewNames(JCas jCas) throws CASException {
> +    List<String> viewNames = Lists.newArrayList();
> +    for (Iterator<JCas> viewIter = jCas.getViewIterator(); viewIter.hasNext();) {
> +      JCas view = viewIter.next();
> +      viewNames.add(view.getViewName());
> +    }
> +    return viewNames;
> +  }
> +
> +  /**
> +   * Null-safe equality check that reports a mismatch on standard error in
> +   * diff style ({@code -} for the first value, {@code +} for the second).
> +   *
> +   * @param name label for the compared values, used in the report
> +   * @param o1 the first value, may be null
> +   * @param o2 the second value, may be null
> +   * @return true if the two values are equal
> +   */
> +  private static boolean areEqual(String name, Object o1, Object o2) {
> +    if (Objects.equal(o1, o2)) {
> +      return true;
> +    }
> +    System.err.printf("Difference in %s:\n-%s\n+%s\n", name, o1, o2);
> +    return false;
> +  }
> +
> +  /**
> +   * Groups all indexed feature structures of the given class (as returned by
> +   * {@code getAllIndexedFS} for its type) by their own type.
> +   *
> +   * Keys are ordered by {@code BY_NAME} and the values of each key by
> +   * {@code BY_OFFSETS}.
> +   *
> +   * NOTE(review): TreeMultimap keeps the values of a key in a sorted set, so
> +   * two feature structures of one type that BY_OFFSETS ranks as equal would
> +   * be stored only once - confirm this is acceptable.
> +   *
> +   * @param jCas the view to read feature structures from
> +   * @param annotationClass the class whose indexed feature structures are collected
> +   * @return the sorted type-to-feature-structure multimap
> +   */
> +  private static Multimap<Type, FeatureStructure> toSortedMultimap(
> +      JCas jCas,
> +      Class<?> annotationClass) {
> +    Type rootType = JCasUtil.getType(jCas, annotationClass);
> +    FSIterator<FeatureStructure> indexed = jCas.getFSIndexRepository().getAllIndexedFS(rootType);
> +    Multimap<Type, FeatureStructure> byType = TreeMultimap.create(BY_NAME, BY_OFFSETS);
> +    while (indexed.hasNext()) {
> +      FeatureStructure current = indexed.next();
> +      // key by the concrete type of each structure, not by the requested root type
> +      byType.put(current.getType(), current);
> +    }
> +    return byType;
> +  }
> +
> +  /** Orders UIMA types by the natural (String) ordering of their names. */
> +  private static final Ordering<Type> BY_NAME =
> +      Ordering.natural().onResultOf(new Function<Type, String>() {
> +        @Override
> +        public String apply(@Nullable Type type) {
> +          return type.getName();
> +        }
> +      });
> +
> +  private static final Ordering<FeatureStructure> BY_OFFSETS =
> +      Ordering.natural().<Integer> lexicographical().onResultOf(
> +          new Function<FeatureStructure, Iterable<Integer>>() {
> +            @Override
> +            public Iterable<Integer> apply(@Nullable FeatureStructure input) {
> +              List<Integer> offsets = Lists.newArrayList();
> +              if (input != null) {
> +                if (input instanceof Annotation) {
> +                  Annotation annotation = (Annotation) input;
> +                  offsets.add(annotation.getBegin());
> +                  offsets.add(annotation.getEnd());
> +                } else if (input instanceof FSArray) {
> +                  FSArray fsArray = (FSArray) input;
> +                  for (int i = 0; i < fsArray.size(); ++i) {
> +                    Iterables.addAll(offsets, this.apply(fsArray.get(i)));
> +                  }
> +                } else if (input instanceof NonEmptyFSList) {
> +                  NonEmptyFSList fsList = (NonEmptyFSList) input;
> +                  Iterables.addAll(offsets, this.apply(fsList.getHead()));
> +                  Iterables.addAll(offsets, this.apply(fsList.getTail()));
> +                } else {
> +                  for (Feature feature : input.getType().getFeatures()) {
> +                    if (!feature.getRange().isPrimitive()) {
> +                      Iterables.addAll(offsets,
> this.apply(input.getFeatureValue(feature)));
> +                    }
> +                  }
> +                }
> +              }
> +              return offsets;
> +            }
> +          });
> +
> +  public static class FSDiff {
> +    private List<FSDifference> differences;
> +    private FeatureStructure root1, root2;
> +
> +    public FSDiff(FeatureStructure root1, FeatureStructure root2) {
> +      this.root1 = root1;
> +      this.root2 = root2;
> +      this.differences = Lists.newArrayList();
> +      this.findDifferences(
> +          this.root1,
> +          this.root2,
> +          Lists.<Feature> newArrayList(),
> +          Lists.<FeatureStructure> newArrayList());
> +    }
> +
> +    public boolean hasDifferences() {
> +      return !this.differences.isEmpty();
> +    }
> +
> +    @Override
> +    public String toString() {
> +      String diff;
> +      if (!this.hasDifferences()) {
> +        diff = "";
> +      } else {
> +        List<String> paths = Lists.newArrayList();
> +        for (FSDifference difference : this.differences) {
> +          List<String> featureNames = Lists.newArrayList();
> +          for (Feature feature : difference.getPath()) {
> +            featureNames.add(feature.getShortName());
> +          }
> +          paths.add(Joiner.on('/').join(featureNames));
> +        }
> +        diff = this.root1.toString();
> +        for (FSDifference difference : this.differences) {
> +          String value1 = difference.getValue1().toString().trim();
> +          String value2 = difference.getValue2().toString().trim();
> +          String value1space = value1.replaceAll("\\s+", "\\\\s+");
> +          Pattern pattern =
> +              Pattern.compile(String.format("^(.*?)(%s)", value1space),
> Pattern.MULTILINE);
> +          Matcher matcher = pattern.matcher(diff);
> +          StringBuffer buffer = new StringBuffer();
> +          while (matcher.find()) {
> +            String prefix = matcher.group(1);
> +            String replacement;
> +            // don't re-replace things that have already been taken care of
> +            if (prefix.startsWith("-") || prefix.startsWith("+")) {
> +              replacement = matcher.group();
> +            }
> +            // replace the current text with diff-style +/- text
> +            else {
> +              Matcher indentMatcher = Pattern.compile("^\\s*").matcher(prefix);
> +              indentMatcher.find();
> +              String indent = indentMatcher.group();
> +              replacement =
> +                  String.format(
> +                      "%s%s\n%s%s",
> +                      "-" + prefix,
> +                      value1.replaceAll("\n", "\n-" + indent),
> +                      "+" + prefix,
> +                      value2.replaceAll("\n", "\n+" + indent));
> +            }
> +            matcher.appendReplacement(buffer, replacement);
> +          }
> +          matcher.appendTail(buffer);
> +          diff = buffer.toString();
> +        }
> +        diff = diff.replaceAll("(?m)^(?![+-])", " ");
> +        diff = String.format("Difference in %s:\n%s", paths, diff);
> +      }
> +      return diff;
> +    }
> +
> +    private void findDifferences(
> +        FeatureStructure fs1,
> +        FeatureStructure fs2,
> +        List<Feature> featurePath,
> +        List<FeatureStructure> seen) {
> +      if (!seen.contains(fs1) && !seen.contains(fs2)) {
> +        seen.add(fs1);
> +        seen.add(fs2);
> +        for (Feature feature : fs1.getType().getFeatures()) {
> +          if (feature.getName().equals("uima.cas.AnnotationBase:sofa")) {
> +            continue;
> +          }
> +          List<Feature> newPath = Lists.newArrayList(featurePath);
> +          newPath.add(feature);
> +          if (feature.getRange().isPrimitive()) {
> +            String value1 = fs1.getFeatureValueAsString(feature);
> +            String value2 = fs2.getFeatureValueAsString(feature);
> +            if (!Objects.equal(value1, value2)) {
> +              this.differences.add(new FSDifference(newPath, value1, value2));
> +            }
> +          } else {
> +            FeatureStructure value1 = fs1.getFeatureValue(feature);
> +            FeatureStructure value2 = fs2.getFeatureValue(feature);
> +            if (value1 == null
> +                || value2 == null
> +                ||
> !value1.getType().getName().equals(value2.getType().getName())) {
> +              if (!Objects.equal(value1, value2)) {
> +                this.differences.add(new FSDifference(newPath, value1, value2));
> +              }
> +            } else {
> +              this.findDifferences(value1, value2, newPath, seen);
> +            }
> +          }
> +        }
> +      }
> +    }
> +  }
> +
> +  /**
> +   * A single difference: the feature path at which two feature structures
> +   * diverge, together with the value found on each side.
> +   */
> +  public static class FSDifference {
> +
> +    private final List<Feature> path;
> +    private final Object value1;
> +    private final Object value2;
> +
> +    /**
> +     * @param path the features leading to the differing values
> +     * @param value1 the value found on the first side
> +     * @param value2 the value found on the second side
> +     */
> +    public FSDifference(List<Feature> path, Object value1, Object value2) {
> +      this.path = path;
> +      this.value1 = value1;
> +      this.value2 = value2;
> +    }
> +
> +    /** @return the features leading to the differing values */
> +    public List<Feature> getPath() {
> +      return this.path;
> +    }
> +
> +    /** @return the value found on the first side */
> +    public Object getValue1() {
> +      return this.value1;
> +    }
> +
> +    /** @return the value found on the second side */
> +    public Object getValue2() {
> +      return this.value2;
> +    }
> +  }
> +}
>