You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2015/03/25 12:28:13 UTC

svn commit: r1669095 - in /uima/ruta/trunk/ruta-core/src/main: java/org/apache/uima/ruta/engine/HtmlConverter.java java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java resources/org/apache/uima/ruta/engine/HtmlConverter.xml

Author: pkluegl
Date: Wed Mar 25 11:28:12 2015
New Revision: 1669095

URL: http://svn.apache.org/r1669095
Log:
UIMA-4305
- added parameter newlineInducingTagRegExp to HtmlConverter

Modified:
    uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
    uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
    uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml

Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java?rev=1669095&r1=1669094&r2=1669095&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java Wed Mar 25 11:28:12 2015
@@ -154,6 +154,16 @@ public class HtmlConverter extends JCasA
   private String[] newlineInducingTags;
 
   /**
+   * This string parameter contains a regular expression for HTML/XML element names. If the
+   * pattern matches the name of an element, that element introduces a line break in the same way
+   * as the elements listed in the parameter <code>newlineInducingTags</code>.
+   */
+  public static final String PARAM_NEWLINE_INDUCING_TAG_REGEXP = "newlineInducingTagRegExp";
+
+  @ConfigurationParameter(name = PARAM_NEWLINE_INDUCING_TAG_REGEXP, mandatory = false)
+  private String newlineInducingTagRegExp;
+
+  /**
    * This string array parameter sets the names of the html tags that create additional text in the
    * output view. The actual string of the gap is defined by the parameter <code>gapText</code>.
    */
@@ -308,7 +318,7 @@ public class HtmlConverter extends JCasA
     try {
       Parser parser = new Parser(documentText);
       NodeList list = parser.parse(null);
-      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, gapInducingTags,
+      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, newlineInducingTagRegExp, gapInducingTags,
               gapText, skipWhitespaces, processAll);
       list.visitAllNodesWith(visitor);
       visibleSpansSoFar = visitor.getTextSpans();

Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java?rev=1669095&r1=1669094&r2=1669095&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java Wed Mar 25 11:28:12 2015
@@ -24,6 +24,8 @@ import java.util.Collection;
 import java.util.List;
 import java.util.SortedSet;
 import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.lang3.StringUtils;
 import org.htmlparser.Tag;
@@ -31,7 +33,6 @@ import org.htmlparser.Text;
 import org.htmlparser.tags.ScriptTag;
 import org.htmlparser.visitors.TextExtractingVisitor;
 
-
 public class HtmlConverterVisitor extends TextExtractingVisitor {
 
   private boolean inBody = false;
@@ -54,23 +55,29 @@ public class HtmlConverterVisitor extend
 
   private String gapText;
 
+  private Pattern newlineInducingTagPattern;
 
-  public HtmlConverterVisitor(String[] newlineInducingTags,String[] gapInducingTags, String gapText, boolean skipWhitespace, boolean processAll) {
-    if(newlineInducingTags != null) {
+  public HtmlConverterVisitor(String[] newlineInducingTags, String newlineInducingTagRegExp,
+          String[] gapInducingTags, String gapText, boolean skipWhitespace, boolean processAll) {
+    if (newlineInducingTags != null) {
       this.newlineInducingTags = Arrays.asList(newlineInducingTags);
     }
-    if(gapInducingTags != null) {
+    if (gapInducingTags != null) {
       this.gapInducingTags = Arrays.asList(gapInducingTags);
     }
     this.gapText = gapText;
     this.skipWhitespace = skipWhitespace;
     this.processAll = processAll;
+    if (newlineInducingTagRegExp != null) {
+      newlineInducingTagPattern = Pattern.compile(newlineInducingTagRegExp);
+    }
   }
 
   @Override
   public void visitStringNode(Text node) {
     super.visitStringNode(node);
-    if ((processAll || this.inBody) && !this.inScript && (!skipWhitespace || !StringUtils.isBlank(node.getText()))) {
+    if ((processAll || this.inBody) && !this.inScript
+            && (!skipWhitespace || !StringUtils.isBlank(node.getText()))) {
       int from = node.getStartPosition();
       int to = node.getEndPosition();
       textSpans.add(new HtmlConverterPSpan(from, to, node.getText()));
@@ -86,14 +93,23 @@ public class HtmlConverterVisitor extend
     } else if (trimmedTagnameLowercase.equals("script")) {
       inScript = true;
     }
-    if (newlineInducingTags != null && newlineInducingTags.contains(trimmedTagnameLowercase)) {
+    boolean matchedByPattern = false;
+    if (newlineInducingTagPattern != null) {
+      Matcher matcher = newlineInducingTagPattern.matcher(trimmedTagnameLowercase);
+      if (matcher.matches()) {
+        matchedByPattern = true;
+      }
+    }
+    if (matchedByPattern
+            || (newlineInducingTags != null && newlineInducingTags
+                    .contains(trimmedTagnameLowercase))) {
       int begin = tag.getStartPosition();
       linebreaksFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin + 1,
               HtmlConverter.LINEBREAK));
     }
     if (gapInducingTags != null && gapInducingTags.contains(trimmedTagnameLowercase)) {
       int begin = tag.getStartPosition();
-      gapsFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin+gapText.length(),
+      gapsFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin + gapText.length(),
               gapText));
     }
   }
@@ -115,7 +131,7 @@ public class HtmlConverterVisitor extend
   public SortedSet<HtmlConverterPSpan> getLinebreaksFromHtmlTags() {
     return linebreaksFromHtmlTags;
   }
-  
+
   public SortedSet<HtmlConverterPSpan> getGapsFromHtmlTags() {
     return gapsFromHtmlTags;
   }

Modified: uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml?rev=1669095&r1=1669094&r2=1669095&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml (original)
+++ uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml Wed Mar 25 11:28:12 2015
@@ -40,6 +40,12 @@
         <mandatory>false</mandatory>
       </configurationParameter>
       <configurationParameter>
+        <name>newlineInducingTagRegExp</name>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
         <name>replaceLinebreaks</name>
         <type>Boolean</type>
         <multiValued>false</multiValued>