You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2017/01/03 16:26:02 UTC
svn commit: r1777155 - in /uima/ruta/trunk/ruta-core/src:
main/java/org/apache/uima/ruta/engine/HtmlConverter.java
test/java/org/apache/uima/ruta/engine/HtmlConverterTest.java
test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java
Author: pkluegl
Date: Tue Jan 3 16:26:02 2017
New Revision: 1777155
URL: http://svn.apache.org/viewvc?rev=1777155&view=rev
Log:
UIMA-5147
- refactored config params
- added test
Modified:
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterTest.java
uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java
Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java?rev=1777155&r1=1777154&r2=1777155&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java Tue Jan 3 16:26:02 2017
@@ -83,7 +83,7 @@ public class HtmlConverter extends JCasA
*/
public static final String PARAM_OUTPUT_VIEW = "outputView";
- @ConfigurationParameter(name = PARAM_OUTPUT_VIEW, mandatory = false, defaultValue = DEFAULT_MODIFIED_VIEW)
+ @ConfigurationParameter(name = PARAM_OUTPUT_VIEW, mandatory = true, defaultValue = DEFAULT_MODIFIED_VIEW)
private String modifiedViewName;
/**
@@ -101,7 +101,7 @@ public class HtmlConverter extends JCasA
public static final String PARAM_REPLACE_LINEBREAKS = "replaceLinebreaks";
@ConfigurationParameter(name = PARAM_REPLACE_LINEBREAKS, mandatory = false, defaultValue = "true")
- private Boolean replaceLinebreaks;
+ private boolean replaceLinebreaks;
/**
* This boolean parameter determines if the converter should skip whitespaces. Html documents
@@ -113,7 +113,7 @@ public class HtmlConverter extends JCasA
public static final String PARAM_SKIP_WHITESPACES = "skipWhitespaces";
@ConfigurationParameter(name = PARAM_SKIP_WHITESPACES, mandatory = false, defaultValue = "true")
- private Boolean skipWhitespaces;
+ private boolean skipWhitespaces;
/**
* If this boolean parameter is set to true, then the tags of the complete document is processed
@@ -121,8 +121,8 @@ public class HtmlConverter extends JCasA
*/
public static final String PARAM_PROCESS_ALL = "processAll";
- @ConfigurationParameter(name = PARAM_PROCESS_ALL, mandatory = false, defaultValue = "false")
- private Boolean processAll;
+ @ConfigurationParameter(name = PARAM_PROCESS_ALL, mandatory = true, defaultValue = "false")
+ private boolean processAll;
/**
* If this boolean parameter is set to true, then zero-length annotation will not be dropped, but
@@ -131,8 +131,8 @@ public class HtmlConverter extends JCasA
*/
public static final String PARAM_EXPAND_OFFSETS = "expandOffsets";
- @ConfigurationParameter(name = PARAM_EXPAND_OFFSETS, mandatory = false, defaultValue = "false")
- private Boolean expandOffsets;
+ @ConfigurationParameter(name = PARAM_EXPAND_OFFSETS, mandatory = true, defaultValue = "false")
+ private boolean expandOffsets;
/**
* This string parameter determines the character sequence that replaces a linebreak. The default
@@ -165,11 +165,11 @@ public class HtmlConverter extends JCasA
/**
* This string array parameter sets the names of the html tags that create additional text in the
- * output view. The acutal string of the gap is defined by the parameter <code>gapText</code>.
+ * output view. The actual string of the gap is defined by the parameter <code>gapText</code>.
*/
public static final String PARAM_GAP_INDUCING_TAGS = "gapInducingTags";
- @ConfigurationParameter(name = PARAM_GAP_INDUCING_TAGS, mandatory = false)
+ @ConfigurationParameter(name = PARAM_GAP_INDUCING_TAGS, mandatory = true, defaultValue = {})
private String[] gapInducingTags;
/**
@@ -178,7 +178,7 @@ public class HtmlConverter extends JCasA
*/
public static final String PARAM_GAP_TEXT = "gapText";
- @ConfigurationParameter(name = PARAM_GAP_TEXT, mandatory = false, defaultValue = "")
+ @ConfigurationParameter(name = PARAM_GAP_TEXT, mandatory = true, defaultValue = "")
private String gapText;
/**
@@ -186,8 +186,8 @@ public class HtmlConverter extends JCasA
*/
public static final String PARAM_USE_SPACE_GAP = "useSpaceGap";
- @ConfigurationParameter(name = PARAM_USE_SPACE_GAP, mandatory = false, defaultValue = "")
- private Boolean useSpaceGap;
+ @ConfigurationParameter(name = PARAM_USE_SPACE_GAP, mandatory = true, defaultValue = "false")
+ private boolean useSpaceGap;
/**
* This string array parameter can be used to apply custom conversions. It defaults to a list of
@@ -212,7 +212,7 @@ public class HtmlConverter extends JCasA
*/
public static final String PARAM_CONVERSION_POLICY = "conversionPolicy";
- @ConfigurationParameter(name = PARAM_CONVERSION_POLICY, mandatory = false, defaultValue = "heuristic")
+ @ConfigurationParameter(name = PARAM_CONVERSION_POLICY, mandatory = true, defaultValue = "heuristic")
private String conversionPolicy;
/**
@@ -232,17 +232,6 @@ public class HtmlConverter extends JCasA
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
- inputViewName = (String) aContext.getConfigParameterValue(PARAM_INPUT_VIEW);
- inputViewName = StringUtils.isBlank(inputViewName) ? null : inputViewName;
- modifiedViewName = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_VIEW);
- modifiedViewName = StringUtils.isBlank(modifiedViewName) ? DEFAULT_MODIFIED_VIEW
- : modifiedViewName;
- replaceLinebreaks = (Boolean) aContext.getConfigParameterValue(PARAM_REPLACE_LINEBREAKS);
- replaceLinebreaks = replaceLinebreaks == null ? true : replaceLinebreaks;
- skipWhitespaces = (Boolean) aContext.getConfigParameterValue(PARAM_SKIP_WHITESPACES);
- skipWhitespaces = skipWhitespaces == null ? true : skipWhitespaces;
- processAll = (Boolean) aContext.getConfigParameterValue(PARAM_PROCESS_ALL);
- processAll = processAll == null ? true : processAll;
linebreakReplacement = (String) aContext.getConfigParameterValue(PARAM_LINEBREAK_REPLACEMENT);
linebreakReplacement = linebreakReplacement == null ? "" : linebreakReplacement;
String conversionPolicy = (String) aContext.getConfigParameterValue(PARAM_CONVERSION_POLICY);
@@ -254,23 +243,11 @@ public class HtmlConverter extends JCasA
throw new ResourceInitializationException("illegal conversionPolicy parameter value",
new Object[0]);
}
- String[] nlTags = (String[]) aContext.getConfigParameterValue(PARAM_NEWLINE_INDUCING_TAGS);
- if (nlTags == null) {
- newlineInducingTags = new String[] { "br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2",
- "h3", "h4", "h5", "h6", "blockquote" };
-
- }
// check assertions
if (modifiedViewName.equals(inputViewName)) {
throw new ResourceInitializationException("input and output view names must differ!",
new Object[0]);
}
- conversionPatterns = (String[]) aContext.getConfigParameterValue(PARAM_CONVERSION_PATTERNS);
- if (conversionPatterns == null) {
- conversionPatterns = new String[] { "&nbsp;", "&laquo;", "&raquo;", "&quot;", "&amp;",
- "&lt;", "&gt;", "&apos;", "&sect;", "&uml;", "&copy;", "&trade;", "&reg;", "&ouml;",
- "&auml;", "&uuml;", "&#160;" };
- }
conversionReplacements = (String[]) aContext
.getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS);
if (conversionReplacements == null) {
@@ -282,24 +259,11 @@ public class HtmlConverter extends JCasA
}
}
- gapText = (String) aContext.getConfigParameterValue(PARAM_GAP_TEXT);
- gapText = gapText == null ? "" : gapText;
-
- useSpaceGap = (Boolean) aContext.getConfigParameterValue(PARAM_USE_SPACE_GAP);
- useSpaceGap = useSpaceGap == null ? false : useSpaceGap;
if (useSpaceGap) {
gapText = " ";
}
- gapInducingTags = (String[]) aContext.getConfigParameterValue(PARAM_GAP_INDUCING_TAGS);
- gapInducingTags = gapInducingTags == null ? new String[0] : gapInducingTags;
-
- expandOffsets = (Boolean) aContext.getConfigParameterValue(PARAM_EXPAND_OFFSETS);
- expandOffsets = expandOffsets == null ? false : expandOffsets;
-
- newlineInducingTagRegExp = (String) aContext
- .getConfigParameterValue(PARAM_NEWLINE_INDUCING_TAG_REGEXP);
}
@Override
Modified: uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterTest.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterTest.java?rev=1777155&r1=1777154&r2=1777155&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterTest.java (original)
+++ uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterTest.java Tue Jan 3 16:26:02 2017
@@ -515,4 +515,34 @@ public class HtmlConverterTest {
// fini
cas.release();
}
+
+ @Test
+ public void testStyle() throws Exception {
+ String html = "<html><head>\n" + "<style>\n" + "/* */\n" + ".test {\n" + " text-align: left;\n" + "}\n"
+ + "</style>\n" + "</head><body>Hello world</body></html>";
+ URL url = HtmlConverter.class.getClassLoader().getResource("HtmlConverter.xml");
+ if (url == null) {
+ url = HtmlConverter.class.getClassLoader().getResource(
+ "org/apache/uima/ruta/engine/HtmlConverter.xml");
+ }
+ XMLInputSource in = new XMLInputSource(url);
+ ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);
+ AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(specifier);
+ CAS cas = ae.newCAS();
+
+ ae.setConfigParameterValue(HtmlConverter.PARAM_OUTPUT_VIEW, outputViewName);
+ ae.reconfigure();
+ cas.reset();
+ cas.setDocumentText(html);
+
+ ae.process(cas);
+
+ CAS modifiedView = cas.getView(outputViewName);
+ String text = modifiedView.getDocumentText();
+
+ String expectedText = "Hello world";
+ assertEquals(expectedText, text);
+
+ cas.release();
+ }
}
Modified: uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java?rev=1777155&r1=1777154&r2=1777155&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java (original)
+++ uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java Tue Jan 3 16:26:02 2017
@@ -47,14 +47,14 @@ public class HtmlConverterXmlTest {
URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
if (urlA == null) {
- urlA = HtmlAnnotator.class.getClassLoader().getResource(
- "org/apache/uima/ruta/engine/HtmlAnnotator.xml");
+ urlA = HtmlAnnotator.class.getClassLoader()
+ .getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml");
}
URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
if (urlC == null) {
- urlC = HtmlAnnotator.class.getClassLoader().getResource(
- "org/apache/uima/ruta/engine/HtmlConverter.xml");
+ urlC = HtmlAnnotator.class.getClassLoader()
+ .getResource("org/apache/uima/ruta/engine/HtmlConverter.xml");
}
XMLInputSource inA = new XMLInputSource(urlA);
@@ -68,8 +68,8 @@ public class HtmlConverterXmlTest {
AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
- aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] { "child1",
- "child2", "child3" });
+ aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_INDUCING_TAGS,
+ new String[] { "child1", "child2", "child3" });
aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
aeC.reconfigure();
@@ -107,14 +107,14 @@ public class HtmlConverterXmlTest {
URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
if (urlA == null) {
- urlA = HtmlAnnotator.class.getClassLoader().getResource(
- "org/apache/uima/ruta/engine/HtmlAnnotator.xml");
+ urlA = HtmlAnnotator.class.getClassLoader()
+ .getResource("org/apache/uima/ruta/engine/HtmlAnnotator.xml");
}
URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
if (urlC == null) {
- urlC = HtmlAnnotator.class.getClassLoader().getResource(
- "org/apache/uima/ruta/engine/HtmlConverter.xml");
+ urlC = HtmlAnnotator.class.getClassLoader()
+ .getResource("org/apache/uima/ruta/engine/HtmlConverter.xml");
}
XMLInputSource inA = new XMLInputSource(urlA);
@@ -161,9 +161,11 @@ public class HtmlConverterXmlTest {
next = iterator.next();
boolean b2 = next.getBooleanValue(expandedFeature);
assertEquals("More content.", next.getCoveredText());
- // for one of these two annotation (with same offsets) the feature must be set to true
+ // for one of these two annotation (with same offsets) the feature must be set to true
assertEquals(true, b1 || b2);
cas.release();
}
+
+
}