You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2015/03/21 18:54:22 UTC

svn commit: r1668310 - in /uima/ruta/trunk/ruta-core/src: main/java/org/apache/uima/ruta/engine/ main/resources/org/apache/uima/ruta/engine/ test/java/org/apache/uima/ruta/engine/

Author: pkluegl
Date: Sat Mar 21 17:54:22 2015
New Revision: 1668310

URL: http://svn.apache.org/r1668310
Log:
UIMA-4286
- added parameter gapInducingTags
- added parameter gapText
- added feature expandedOffsets
- added test

Added:
    uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java
Modified:
    uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
    uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
    uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml

Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java?rev=1668310&r1=1668309&r2=1668310&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java Sat Mar 21 17:54:22 2015
@@ -33,6 +33,8 @@ import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.cas.text.AnnotationIndex;
@@ -71,10 +73,10 @@ public class HtmlConverter extends JCasA
 
   public static final String NAMESPACE = "org.apache.uima.ruta.type.html.";
 
-  private static final String DEFAULT_MODIFIED_VIEW = "plaintext";
+  public static final String DEFAULT_MODIFIED_VIEW = "plaintext";
 
   public static final String LINEBREAK = "\n";
-  
+
   /**
    * This string parameter specifies the name of the new view. The default value is
    * <code>plaintext</code>.
@@ -114,14 +116,23 @@ public class HtmlConverter extends JCasA
   private Boolean skipWhitespaces;
 
   /**
-   * TODO
+   * If this boolean parameter is set to true, then the tags of the complete document are processed
+   * and not only those within the body tag.
    */
   public static final String PARAM_PROCESS_ALL = "processAll";
 
   @ConfigurationParameter(name = PARAM_PROCESS_ALL, mandatory = false, defaultValue = "false")
   private Boolean processAll;
 
-  
+  /**
+   * If this boolean parameter is set to true, then annotations that would otherwise not be carried
+   * over to the new view get the offsets of the next best annotation (see feature expandedOffsets).
+   */
+  public static final String PARAM_EXPAND_OFFSETS = "expandOffsets";
+
+  @ConfigurationParameter(name = PARAM_EXPAND_OFFSETS, mandatory = false, defaultValue = "false")
+  private Boolean expandOffsets;
+
   /**
    * This string parameter determines the character sequence that replaces a linebreak. The default
    * behavior is the empty string.
@@ -142,6 +153,24 @@ public class HtmlConverter extends JCasA
   private String[] newlineInducingTags;
 
   /**
+   * This string array parameter sets the names of the html tags that create additional text in the
+   * output view. The actual string of the gap is defined by the parameter <code>gapText</code>.
+   */
+  public static final String PARAM_GAP_INDUCING_TAGS = "gapInducingTags";
+
+  @ConfigurationParameter(name = PARAM_GAP_INDUCING_TAGS, mandatory = false)
+  private String[] gapInducingTags;
+
+  /**
+   * This string parameter determines the character sequence that is introduced by the html tags
+   * specified in the <code>gapInducingTags</code>. It defaults to the empty string.
+   */
+  public static final String PARAM_GAP_TEXT = "gapText";
+
+  @ConfigurationParameter(name = PARAM_GAP_TEXT, mandatory = false, defaultValue = "")
+  private String gapText;
+
+  /**
    * This string array parameter can be used to apply custom conversions. It defaults to a list of
    * commonly used codes, e.g., <![CDATA[&nbsp;]]>, which are converted using html 4 entity
    * unescaping. However, explicit conversion strings can also be passed via the parameter
@@ -169,18 +198,16 @@ public class HtmlConverter extends JCasA
 
   /**
    * This string array parameter corresponds to <code>conversionPatterns</code> such that
-   * <code>conversionPatterns[i]</code> will be replaced by
-   * <code>conversionReplacements[i]</code>; replacements should be shorter than the source
-   * pattern. Per default, the replacement strings are computed using Html4 decoding. Remember to
-   * enable explicit conversion via <code>conversionPolicy</code> first.
+   * <code>conversionPatterns[i]</code> will be replaced by <code>conversionReplacements[i]</code>;
+   * replacements should be shorter than the source pattern. Per default, the replacement strings
+   * are computed using Html4 decoding. Remember to enable explicit conversion via
+   * <code>conversionPolicy</code> first.
    */
   public static final String PARAM_CONVERSION_REPLACEMENTS = "conversionReplacements";
 
   @ConfigurationParameter(name = PARAM_CONVERSION_REPLACEMENTS, mandatory = false)
   private String[] conversionReplacements;
 
-
-
   private int[] map;
 
   @Override
@@ -225,7 +252,8 @@ public class HtmlConverter extends JCasA
           "&lt;", "&gt;", "&apos;", "&sect;", "&uml;", "&copy;", "&trade;", "&reg;", "&ouml;",
           "&auml;", "&uuml;", "&#160;" };
     }
-    conversionReplacements = (String[]) aContext.getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS);
+    conversionReplacements = (String[]) aContext
+            .getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS);
     if (conversionReplacements == null) {
       conversionReplacements = new String[conversionPatterns.length];
       for (int i = 0; i < conversionPatterns.length; i++) {
@@ -273,15 +301,18 @@ public class HtmlConverter extends JCasA
     }
     SortedSet<HtmlConverterPSpan> visibleSpansSoFar = new TreeSet<HtmlConverterPSpan>();
     SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
+    SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
 
     // process
     try {
       Parser parser = new Parser(documentText);
       NodeList list = parser.parse(null);
-      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, skipWhitespaces, processAll);
+      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, gapInducingTags,
+              gapText, skipWhitespaces, processAll);
       list.visitAllNodesWith(visitor);
       visibleSpansSoFar = visitor.getTextSpans();
       linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
+      gapsFromHtmlTags = visitor.getGapsFromHtmlTags();
     } catch (ParserException e) {
       throw new AnalysisEngineProcessException(e);
     }
@@ -298,6 +329,7 @@ public class HtmlConverter extends JCasA
       }
     }
     visibleSpansSoFar.addAll(linebreaksFromHtmlTags);
+    visibleSpansSoFar.addAll(gapsFromHtmlTags);
 
     // create new doc-text and the map from deletions and visible-text-spans:
     StringBuffer sbu = new StringBuffer(documentText.length());
@@ -352,6 +384,7 @@ public class HtmlConverter extends JCasA
     JCas modview = fromJcas.getView(toView);
 
     Set<Annotation> indexedFs = new HashSet<Annotation>();
+    Set<Annotation> toExpand = new HashSet<Annotation>();
     AnnotationIndex<Annotation> annotationIndex = fromJcas.getAnnotationIndex();
     TypeSystem typeSystem = fromJcas.getTypeSystem();
     Type docType = typeSystem.getType(UIMAConstants.TYPE_DOCUMENT);
@@ -383,8 +416,56 @@ public class HtmlConverter extends JCasA
             getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping");
           }
         }
+      } else if (expandOffsets) {
+        clone.setBegin(mappedBegin);
+        clone.setEnd(mappedEnd);
+        toExpand.add(clone);
+      }
+    }
+
+    for (Annotation each : toExpand) {
+      Annotation nextBestAnnotation = getNextBestAnnotation(each, modview);
+      if (nextBestAnnotation != null) {
+        each.setBegin(nextBestAnnotation.getBegin());
+        each.setEnd(nextBestAnnotation.getEnd());
+        Feature expandedOffsetsFeature = each.getType().getFeatureByBaseName("expandedOffsets");
+        if (expandedOffsetsFeature != null) {
+          each.setBooleanValue(expandedOffsetsFeature, true);
+        }
+        modview.addFsToIndexes(each);
+      }
+    }
+  }
+
+  private Annotation getNextBestAnnotation(Annotation source, JCas jcas) {
+    // Returns the annotation the index iterator of jcas is positioned at for source, if any.
+    FSIterator<Annotation> iterator = jcas.getAnnotationIndex().iterator(source);
+    Annotation best = null;
+    if (iterator.isValid()) {
+      Annotation annotation = iterator.get();
+      best = annotation;
+    } else {
+      Annotation dummy = new Annotation(jcas, source.getBegin(), source.getBegin() +1);
+      iterator = jcas.getAnnotationIndex().iterator(dummy);
+      if(!iterator.isValid()) {
+        if((jcas.getDocumentText().length()/ 2) > source.getBegin()) {
+          iterator.moveToFirst(); // source begins in the first half of the text
+          if (iterator.isValid()) {
+            Annotation annotation = iterator.get();
+            best = annotation;
+          }
+        } else {
+          iterator.moveToLast(); // otherwise fall back to the last indexed annotation
+          if (iterator.isValid()) {
+            Annotation annotation = iterator.get();
+            best = annotation;
+          }
+        }
+      }
+    }
+    // NOTE(review): best stays null when the iterator for dummy is valid, so the caller skips
+    // that annotation; confirm whether iterator.get() was intended for this case.
+    return best;
+  }
 
   private SortedSet<HtmlConverterPSpan> handleLinebreaksInDocumentText(

Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java?rev=1668310&r1=1668309&r2=1668310&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java Sat Mar 21 17:54:22 2015
@@ -21,6 +21,7 @@ package org.apache.uima.ruta.engine;
 
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.List;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
@@ -43,12 +44,25 @@ public class HtmlConverterVisitor extend
 
   private SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
 
+  private SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
+
   private Collection<String> newlineInducingTags;
 
   private boolean processAll = true;
 
-  public HtmlConverterVisitor(String[] newlineInducingTags, boolean skipWhitespace, boolean processAll) {
-    this.newlineInducingTags = Arrays.asList(newlineInducingTags);
+  private List<String> gapInducingTags;
+
+  private String gapText;
+
+  /** Creates a visitor; null tag arrays disable the respective feature, null gapText means "". */
+  public HtmlConverterVisitor(String[] newlineInducingTags, String[] gapInducingTags, String gapText, boolean skipWhitespace, boolean processAll) {
+    if (newlineInducingTags != null) {
+      this.newlineInducingTags = Arrays.asList(newlineInducingTags);
+    }
+    if (gapInducingTags != null) {
+      this.gapInducingTags = Arrays.asList(gapInducingTags);
+    }
+    this.gapText = gapText == null ? "" : gapText; // visitTag calls gapText.length()
+    this.skipWhitespace = skipWhitespace;
+    this.processAll = processAll;
+  }
@@ -68,15 +82,20 @@ public class HtmlConverterVisitor extend
     super.visitTag(tag);
     String trimmedTagnameLowercase = tag.getTagName().toLowerCase().trim();
     if (trimmedTagnameLowercase.equals("body")) {
-      this.inBody = true;
+      inBody = true;
     } else if (trimmedTagnameLowercase.equals("script")) {
-      this.inScript = true;
+      inScript = true;
     }
-    if (this.newlineInducingTags.contains(trimmedTagnameLowercase)) {
+    if (newlineInducingTags != null && newlineInducingTags.contains(trimmedTagnameLowercase)) {
       int begin = tag.getStartPosition();
-      this.linebreaksFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin + 1,
+      linebreaksFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin + 1,
               HtmlConverter.LINEBREAK));
     }
+    if (gapInducingTags != null && gapInducingTags.contains(trimmedTagnameLowercase)) {
+      int begin = tag.getStartPosition();
+      gapsFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin+gapText.length(),
+              gapText));
+    }
   }
 
   @Override
@@ -96,4 +115,8 @@ public class HtmlConverterVisitor extend
   public SortedSet<HtmlConverterPSpan> getLinebreaksFromHtmlTags() {
     return linebreaksFromHtmlTags;
   }
+  /** Returns the replacement spans recorded by visitTag for tags listed in gapInducingTags. */
+  public SortedSet<HtmlConverterPSpan> getGapsFromHtmlTags() {
+    return gapsFromHtmlTags;
+  }
 }
\ No newline at end of file

Modified: uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml?rev=1668310&r1=1668309&r2=1668310&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml (original)
+++ uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml Sat Mar 21 17:54:22 2015
@@ -35,6 +35,11 @@
           <rangeTypeName>uima.cas.String</rangeTypeName>
         </featureDescription>
         <featureDescription>
+          <name>expandedOffsets</name>
+          <description></description>
+          <rangeTypeName>uima.cas.Boolean</rangeTypeName>
+        </featureDescription>
+        <featureDescription>
           <name>attributeName</name>
           <description></description>
           <rangeTypeName>uima.cas.StringArray</rangeTypeName>

Added: uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java?rev=1668310&view=auto
==============================================================================
--- uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java (added)
+++ uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java Sat Mar 21 17:54:22 2015
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.ruta.engine;
+
+import static org.junit.Assert.assertEquals;
+
+import java.net.URL;
+
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.XMLInputSource;
+import org.junit.Test;
+
+public class HtmlConverterXmlTest {
+  // Runs HtmlAnnotator and HtmlConverter on XML-like input: gap insertion and offset expansion.
+  @Test
+  public void test() throws Exception {
+    String html = "<Parent>\n";
+    html += "<Child1>Some content</Child1>\n";
+    html += "<Child2 attribute=“someValue” />\n";
+    html += "<Child3>More content.</Child3>\n";
+    html += "</Parent>\n";
+
+    URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
+    if (urlA == null) {
+      urlA = HtmlAnnotator.class.getClassLoader().getResource(
+              "org/apache/uima/ruta/engine/HtmlAnnotator.xml");
+    }
+
+    URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
+    if (urlC == null) {
+      urlC = HtmlAnnotator.class.getClassLoader().getResource(
+              "org/apache/uima/ruta/engine/HtmlConverter.xml");
+    }
+
+    XMLInputSource inA = new XMLInputSource(urlA);
+    ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA);
+    AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA);
+    aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
+    aeA.reconfigure();
+    // Converter under test: each ChildN tag is gap inducing and the gap text is "$".
+    XMLInputSource inC = new XMLInputSource(urlC);
+    ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC);
+    AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
+    aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
+    aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
+    aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] { "child1",
+        "child2", "child3" });
+    aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
+    aeC.reconfigure();
+
+    CAS cas = aeA.newCAS();
+    Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
+    AnnotationIndex<AnnotationFS> ai = null;
+    FSIterator<AnnotationFS> iterator = null;
+
+    cas.setDocumentText(html);
+    aeA.process(cas);
+    aeC.process(cas);
+
+    CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
+
+    assertEquals("$Some content$$More content.", plainTextCas.getDocumentText());
+    // The four TAG annotations (presumably Parent, Child1, Child2, Child3) in index order.
+    ai = plainTextCas.getAnnotationIndex(tagType);
+    iterator = ai.iterator();
+    assertEquals(4, ai.size());
+    assertEquals("$Some content$$More content.", iterator.next().getCoveredText());
+    assertEquals("$Some content", iterator.next().getCoveredText());
+    assertEquals("$", iterator.next().getCoveredText());
+    assertEquals("$More content.", iterator.next().getCoveredText());
+    
+    cas.release();
+  }
+  
+  @Test
+  public void testExpandOffsets() throws Exception {
+    String html = "<Parent>\n";
+    html += "<Child1>Some content</Child1>\n";
+    html += "<Child2 attribute=“someValue” />\n";
+    html += "<Child3>More content.</Child3>\n";
+    html += "</Parent>\n";
+
+    URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
+    if (urlA == null) {
+      urlA = HtmlAnnotator.class.getClassLoader().getResource(
+              "org/apache/uima/ruta/engine/HtmlAnnotator.xml");
+    }
+
+    URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
+    if (urlC == null) {
+      urlC = HtmlAnnotator.class.getClassLoader().getResource(
+              "org/apache/uima/ruta/engine/HtmlConverter.xml");
+    }
+
+    XMLInputSource inA = new XMLInputSource(urlA);
+    ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA);
+    AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA);
+    aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
+    aeA.reconfigure();
+    // Converter under test: expandOffsets is enabled and no gap inducing tags are configured.
+    XMLInputSource inC = new XMLInputSource(urlC);
+    ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC);
+    AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
+    aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
+    aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
+    aeC.setConfigParameterValue(HtmlConverter.PARAM_EXPAND_OFFSETS, true);
+    aeC.reconfigure();
+
+    CAS cas = aeA.newCAS();
+    Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
+    Feature expandedFeature = tagType.getFeatureByBaseName("expandedOffsets");
+    AnnotationIndex<AnnotationFS> ai = null;
+    FSIterator<AnnotationFS> iterator = null;
+
+    cas.setDocumentText(html);
+    aeA.process(cas);
+    aeC.process(cas);
+
+    CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
+
+    assertEquals("Some contentMore content.", plainTextCas.getDocumentText());
+    // Three TAGs keep their own span; the fourth is flagged as expanded onto "More content.".
+    ai = plainTextCas.getAnnotationIndex(tagType);
+    iterator = ai.iterator();
+    assertEquals(4, ai.size());
+    AnnotationFS next = null ;
+    next = iterator.next();
+    assertEquals(false, next.getBooleanValue(expandedFeature));
+    assertEquals("Some contentMore content.", next.getCoveredText());
+    next = iterator.next();
+    assertEquals(false, next.getBooleanValue(expandedFeature));
+    assertEquals("Some content", next.getCoveredText());
+    next = iterator.next();
+    assertEquals(false, next.getBooleanValue(expandedFeature));
+    assertEquals("More content.", next.getCoveredText());
+    next = iterator.next();
+    assertEquals(true, next.getBooleanValue(expandedFeature));
+    assertEquals("More content.", next.getCoveredText());
+    
+    cas.release();
+  }
+}