You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2015/03/25 12:28:13 UTC
svn commit: r1669095 - in /uima/ruta/trunk/ruta-core/src/main:
java/org/apache/uima/ruta/engine/HtmlConverter.java
java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
resources/org/apache/uima/ruta/engine/HtmlConverter.xml
Author: pkluegl
Date: Wed Mar 25 11:28:12 2015
New Revision: 1669095
URL: http://svn.apache.org/r1669095
Log:
UIMA-4305
- added parameter
Modified:
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml
Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java?rev=1669095&r1=1669094&r2=1669095&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java Wed Mar 25 11:28:12 2015
@@ -154,6 +154,16 @@ public class HtmlConverter extends JCasA
private String[] newlineInducingTags;
/**
+ * This string parameter contains a regular expression for HTML/XML elements. If the pattern
+ * matches the tag name of an element, that element introduces a line break, just like the
+ * elements listed in the parameter <code>newlineInducingTags</code>.
+ */
+ public static final String PARAM_NEWLINE_INDUCING_TAG_REGEXP = "newlineInducingTagRegExp";
+
+ @ConfigurationParameter(name = PARAM_NEWLINE_INDUCING_TAG_REGEXP, mandatory = false)
+ private String newlineInducingTagRegExp;
+
+ /**
* This string array parameter sets the names of the html tags that create additional text in the
* output view. The actual string of the gap is defined by the parameter <code>gapText</code>.
*/
@@ -308,7 +318,7 @@ public class HtmlConverter extends JCasA
try {
Parser parser = new Parser(documentText);
NodeList list = parser.parse(null);
- HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, gapInducingTags,
+ HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, newlineInducingTagRegExp, gapInducingTags,
gapText, skipWhitespaces, processAll);
list.visitAllNodesWith(visitor);
visibleSpansSoFar = visitor.getTextSpans();
Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java?rev=1669095&r1=1669094&r2=1669095&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java Wed Mar 25 11:28:12 2015
@@ -24,6 +24,8 @@ import java.util.Collection;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.htmlparser.Tag;
@@ -31,7 +33,6 @@ import org.htmlparser.Text;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.visitors.TextExtractingVisitor;
-
public class HtmlConverterVisitor extends TextExtractingVisitor {
private boolean inBody = false;
@@ -54,23 +55,29 @@ public class HtmlConverterVisitor extend
private String gapText;
+ private Pattern newlineInducingTagPattern;
- public HtmlConverterVisitor(String[] newlineInducingTags,String[] gapInducingTags, String gapText, boolean skipWhitespace, boolean processAll) {
- if(newlineInducingTags != null) {
+ public HtmlConverterVisitor(String[] newlineInducingTags, String newlineInducingTagRegExp,
+ String[] gapInducingTags, String gapText, boolean skipWhitespace, boolean processAll) {
+ if (newlineInducingTags != null) {
this.newlineInducingTags = Arrays.asList(newlineInducingTags);
}
- if(gapInducingTags != null) {
+ if (gapInducingTags != null) {
this.gapInducingTags = Arrays.asList(gapInducingTags);
}
this.gapText = gapText;
this.skipWhitespace = skipWhitespace;
this.processAll = processAll;
+ if (newlineInducingTagRegExp != null) {
+ newlineInducingTagPattern = Pattern.compile(newlineInducingTagRegExp);
+ }
}
@Override
public void visitStringNode(Text node) {
super.visitStringNode(node);
- if ((processAll || this.inBody) && !this.inScript && (!skipWhitespace || !StringUtils.isBlank(node.getText()))) {
+ if ((processAll || this.inBody) && !this.inScript
+ && (!skipWhitespace || !StringUtils.isBlank(node.getText()))) {
int from = node.getStartPosition();
int to = node.getEndPosition();
textSpans.add(new HtmlConverterPSpan(from, to, node.getText()));
@@ -86,14 +93,23 @@ public class HtmlConverterVisitor extend
} else if (trimmedTagnameLowercase.equals("script")) {
inScript = true;
}
- if (newlineInducingTags != null && newlineInducingTags.contains(trimmedTagnameLowercase)) {
+ boolean matchedByPattern = false;
+ if (newlineInducingTagPattern != null) {
+ Matcher matcher = newlineInducingTagPattern.matcher(trimmedTagnameLowercase);
+ if (matcher.matches()) {
+ matchedByPattern = true;
+ }
+ }
+ if (matchedByPattern
+ || (newlineInducingTags != null && newlineInducingTags
+ .contains(trimmedTagnameLowercase))) {
int begin = tag.getStartPosition();
linebreaksFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin + 1,
HtmlConverter.LINEBREAK));
}
if (gapInducingTags != null && gapInducingTags.contains(trimmedTagnameLowercase)) {
int begin = tag.getStartPosition();
- gapsFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin+gapText.length(),
+ gapsFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin + gapText.length(),
gapText));
}
}
@@ -115,7 +131,7 @@ public class HtmlConverterVisitor extend
public SortedSet<HtmlConverterPSpan> getLinebreaksFromHtmlTags() {
return linebreaksFromHtmlTags;
}
-
+
public SortedSet<HtmlConverterPSpan> getGapsFromHtmlTags() {
return gapsFromHtmlTags;
}
Modified: uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml?rev=1669095&r1=1669094&r2=1669095&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml (original)
+++ uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlConverter.xml Wed Mar 25 11:28:12 2015
@@ -40,6 +40,12 @@
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
+ <name>newlineInducingTagRegExp</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
<name>replaceLinebreaks</name>
<type>Boolean</type>
<multiValued>false</multiValued>