You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2015/03/21 18:54:22 UTC
svn commit: r1668310 - in /uima/ruta/trunk/ruta-core/src:
main/java/org/apache/uima/ruta/engine/
main/resources/org/apache/uima/ruta/engine/
test/java/org/apache/uima/ruta/engine/
Author: pkluegl
Date: Sat Mar 21 17:54:22 2015
New Revision: 1668310
URL: http://svn.apache.org/r1668310
Log:
UIMA-4286
- added parameter gapInducingTags
- added parameter gapText
- added feature expandedOffsets
- added test
Added:
uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java
Modified:
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml
Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java?rev=1668310&r1=1668309&r2=1668310&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java Sat Mar 21 17:54:22 2015
@@ -33,6 +33,8 @@ import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationIndex;
@@ -71,10 +73,10 @@ public class HtmlConverter extends JCasA
public static final String NAMESPACE = "org.apache.uima.ruta.type.html.";
- private static final String DEFAULT_MODIFIED_VIEW = "plaintext";
+ public static final String DEFAULT_MODIFIED_VIEW = "plaintext";
public static final String LINEBREAK = "\n";
-
+
/**
* This string parameter specifies the name of the new view. The default value is
* <code>plaintext</code>.
@@ -114,14 +116,23 @@ public class HtmlConverter extends JCasA
private Boolean skipWhitespaces;
/**
- * TODO
+ * If this boolean parameter is set to true, then the tags of the complete document are processed
+ * and not only those within the body tag.
*/
public static final String PARAM_PROCESS_ALL = "processAll";
@ConfigurationParameter(name = PARAM_PROCESS_ALL, mandatory = false, defaultValue = "false")
private Boolean processAll;
-
+ /**
+ * If this boolean parameter is set to true, then annotations that would otherwise be lost in the
+ * new view are expanded to the next best annotation and marked by the feature expandedOffsets.
+ */
+ public static final String PARAM_EXPAND_OFFSETS = "expandOffsets";
+
+ @ConfigurationParameter(name = PARAM_EXPAND_OFFSETS, mandatory = false, defaultValue = "false")
+ private Boolean expandOffsets;
+
/**
* This string parameter determines the character sequence that replaces a linebreak. The default
* behavior is the empty string.
@@ -142,6 +153,24 @@ public class HtmlConverter extends JCasA
private String[] newlineInducingTags;
/**
+ * This string array parameter sets the names of the html tags that create additional text in the
+ * output view. The actual string of the gap is defined by the parameter <code>gapText</code>.
+ */
+ public static final String PARAM_GAP_INDUCING_TAGS = "gapInducingTags";
+
+ @ConfigurationParameter(name = PARAM_GAP_INDUCING_TAGS, mandatory = false)
+ private String[] gapInducingTags;
+
+ /**
+ * This string parameter determines the character sequence that is introduced by the html tags
+ * specified in <code>gapInducingTags</code>. The default value is the empty string.
+ */
+ public static final String PARAM_GAP_TEXT = "gapText";
+
+ @ConfigurationParameter(name = PARAM_GAP_TEXT, mandatory = false, defaultValue = "")
+ private String gapText;
+
+ /**
* This string array parameter can be used to apply custom conversions. It defaults to a list of
* commonly used codes, e.g., <![CDATA[&nbsp;]]>, which are converted using html 4 entity
* unescaping. However, explicit conversion strings can also be passed via the parameter
@@ -169,18 +198,16 @@ public class HtmlConverter extends JCasA
/**
* This string array parameter corresponds to <code>conversionPatterns</code> such that
- * <code>conversionPatterns[i]</code> will be replaced by
- * <code>conversionReplacements[i]</code>; replacements should be shorter than the source
- * pattern. Per default, the replacement strings are computed using Html4 decoding. Remember to
- * enable explicit conversion via <code>conversionPolicy</code> first.
+ * <code>conversionPatterns[i]</code> will be replaced by <code>conversionReplacements[i]</code>;
+ * replacements should be shorter than the source pattern. Per default, the replacement strings
+ * are computed using Html4 decoding. Remember to enable explicit conversion via
+ * <code>conversionPolicy</code> first.
*/
public static final String PARAM_CONVERSION_REPLACEMENTS = "conversionReplacements";
@ConfigurationParameter(name = PARAM_CONVERSION_REPLACEMENTS, mandatory = false)
private String[] conversionReplacements;
-
-
private int[] map;
@Override
@@ -225,7 +252,8 @@ public class HtmlConverter extends JCasA
"<", ">", "'", "§", "¨", "©", "™", "®", "ö",
"ä", "ü", " " };
}
- conversionReplacements = (String[]) aContext.getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS);
+ conversionReplacements = (String[]) aContext
+ .getConfigParameterValue(PARAM_CONVERSION_REPLACEMENTS);
if (conversionReplacements == null) {
conversionReplacements = new String[conversionPatterns.length];
for (int i = 0; i < conversionPatterns.length; i++) {
@@ -273,15 +301,18 @@ public class HtmlConverter extends JCasA
}
SortedSet<HtmlConverterPSpan> visibleSpansSoFar = new TreeSet<HtmlConverterPSpan>();
SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
+ SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
// process
try {
Parser parser = new Parser(documentText);
NodeList list = parser.parse(null);
- HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, skipWhitespaces, processAll);
+ HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags, gapInducingTags,
+ gapText, skipWhitespaces, processAll);
list.visitAllNodesWith(visitor);
visibleSpansSoFar = visitor.getTextSpans();
linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
+ gapsFromHtmlTags = visitor.getGapsFromHtmlTags();
} catch (ParserException e) {
throw new AnalysisEngineProcessException(e);
}
@@ -298,6 +329,7 @@ public class HtmlConverter extends JCasA
}
}
visibleSpansSoFar.addAll(linebreaksFromHtmlTags);
+ visibleSpansSoFar.addAll(gapsFromHtmlTags);
// create new doc-text and the map from deletions and visible-text-spans:
StringBuffer sbu = new StringBuffer(documentText.length());
@@ -352,6 +384,7 @@ public class HtmlConverter extends JCasA
JCas modview = fromJcas.getView(toView);
Set<Annotation> indexedFs = new HashSet<Annotation>();
+ Set<Annotation> toExpand = new HashSet<Annotation>();
AnnotationIndex<Annotation> annotationIndex = fromJcas.getAnnotationIndex();
TypeSystem typeSystem = fromJcas.getTypeSystem();
Type docType = typeSystem.getType(UIMAConstants.TYPE_DOCUMENT);
@@ -383,8 +416,56 @@ public class HtmlConverter extends JCasA
getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping");
}
}
+ } else if (expandOffsets) {
+ clone.setBegin(mappedBegin);
+ clone.setEnd(mappedEnd);
+ toExpand.add(clone);
+ }
+ }
+
+ for (Annotation each : toExpand) {
+ Annotation nextBestAnnotation = getNextBestAnnotation(each, modview);
+ if (nextBestAnnotation != null) {
+ each.setBegin(nextBestAnnotation.getBegin());
+ each.setEnd(nextBestAnnotation.getEnd());
+ Feature expandedOffsetsFeature = each.getType().getFeatureByBaseName("expandedOffsets");
+ if (expandedOffsetsFeature != null) {
+ each.setBooleanValue(expandedOffsetsFeature, true);
+ }
+ modview.addFsToIndexes(each);
+ }
+ }
+ }
+
+ private Annotation getNextBestAnnotation(Annotation source, JCas jcas) {
+
+ FSIterator<Annotation> iterator = jcas.getAnnotationIndex().iterator(source);
+ Annotation best = null;
+ if (iterator.isValid()) {
+ Annotation annotation = iterator.get();
+ best = annotation;
+ } else {
+ Annotation dummy = new Annotation(jcas, source.getBegin(), source.getBegin() +1);
+ iterator = jcas.getAnnotationIndex().iterator(dummy);
+ if(!iterator.isValid()) {
+ if((jcas.getDocumentText().length()/ 2) > source.getBegin()) {
+ iterator.moveToFirst();
+ if (iterator.isValid()) {
+ Annotation annotation = iterator.get();
+ best = annotation;
+ }
+ } else {
+ iterator.moveToLast();
+ if (iterator.isValid()) {
+ Annotation annotation = iterator.get();
+ best = annotation;
+ }
+ }
}
}
+
+
+ return best;
}
private SortedSet<HtmlConverterPSpan> handleLinebreaksInDocumentText(
Modified: uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java?rev=1668310&r1=1668309&r2=1668310&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java (original)
+++ uima/ruta/trunk/ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverterVisitor.java Sat Mar 21 17:54:22 2015
@@ -21,6 +21,7 @@ package org.apache.uima.ruta.engine;
import java.util.Arrays;
import java.util.Collection;
+import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
@@ -43,12 +44,25 @@ public class HtmlConverterVisitor extend
private SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
+ private SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
+
private Collection<String> newlineInducingTags;
private boolean processAll = true;
- public HtmlConverterVisitor(String[] newlineInducingTags, boolean skipWhitespace, boolean processAll) {
- this.newlineInducingTags = Arrays.asList(newlineInducingTags);
+ private List<String> gapInducingTags;
+
+ private String gapText;
+
+
+ public HtmlConverterVisitor(String[] newlineInducingTags,String[] gapInducingTags, String gapText, boolean skipWhitespace, boolean processAll) {
+ if(newlineInducingTags != null) {
+ this.newlineInducingTags = Arrays.asList(newlineInducingTags);
+ }
+ if(gapInducingTags != null) {
+ this.gapInducingTags = Arrays.asList(gapInducingTags);
+ }
+ this.gapText = gapText;
this.skipWhitespace = skipWhitespace;
this.processAll = processAll;
}
@@ -68,15 +82,20 @@ public class HtmlConverterVisitor extend
super.visitTag(tag);
String trimmedTagnameLowercase = tag.getTagName().toLowerCase().trim();
if (trimmedTagnameLowercase.equals("body")) {
- this.inBody = true;
+ inBody = true;
} else if (trimmedTagnameLowercase.equals("script")) {
- this.inScript = true;
+ inScript = true;
}
- if (this.newlineInducingTags.contains(trimmedTagnameLowercase)) {
+ if (newlineInducingTags != null && newlineInducingTags.contains(trimmedTagnameLowercase)) {
int begin = tag.getStartPosition();
- this.linebreaksFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin + 1,
+ linebreaksFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin + 1,
HtmlConverter.LINEBREAK));
}
+ if (gapInducingTags != null && gapInducingTags.contains(trimmedTagnameLowercase)) {
+ int begin = tag.getStartPosition();
+ gapsFromHtmlTags.add(new HtmlConverterPSpanReplacement(begin, begin+gapText.length(),
+ gapText));
+ }
}
@Override
@@ -96,4 +115,8 @@ public class HtmlConverterVisitor extend
public SortedSet<HtmlConverterPSpan> getLinebreaksFromHtmlTags() {
return linebreaksFromHtmlTags;
}
+
+ public SortedSet<HtmlConverterPSpan> getGapsFromHtmlTags() {
+ return gapsFromHtmlTags;
+ }
}
\ No newline at end of file
Modified: uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml?rev=1668310&r1=1668309&r2=1668310&view=diff
==============================================================================
--- uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml (original)
+++ uima/ruta/trunk/ruta-core/src/main/resources/org/apache/uima/ruta/engine/HtmlTypeSystem.xml Sat Mar 21 17:54:22 2015
@@ -35,6 +35,11 @@
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
+ <name>expandedOffsets</name>
+ <description></description>
+ <rangeTypeName>uima.cas.Boolean</rangeTypeName>
+ </featureDescription>
+ <featureDescription>
<name>attributeName</name>
<description></description>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
Added: uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java
URL: http://svn.apache.org/viewvc/uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java?rev=1668310&view=auto
==============================================================================
--- uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java (added)
+++ uima/ruta/trunk/ruta-core/src/test/java/org/apache/uima/ruta/engine/HtmlConverterXmlTest.java Sat Mar 21 17:54:22 2015
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.ruta.engine;
+
+import static org.junit.Assert.assertEquals;
+
+import java.net.URL;
+
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.XMLInputSource;
+import org.junit.Test;
+
+public class HtmlConverterXmlTest {
+
+ @Test
+ public void test() throws Exception {
+ String html = "<Parent>\n";
+ html += "<Child1>Some content</Child1>\n";
+ html += "<Child2 attribute=“someValue” />\n";
+ html += "<Child3>More content.</Child3>\n";
+ html += "</Parent>\n";
+
+ URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
+ if (urlA == null) {
+ urlA = HtmlAnnotator.class.getClassLoader().getResource(
+ "org/apache/uima/ruta/engine/HtmlAnnotator.xml");
+ }
+
+ URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
+ if (urlC == null) {
+ urlC = HtmlAnnotator.class.getClassLoader().getResource(
+ "org/apache/uima/ruta/engine/HtmlConverter.xml");
+ }
+
+ XMLInputSource inA = new XMLInputSource(urlA);
+ ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA);
+ AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA);
+ aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
+ aeA.reconfigure();
+
+ XMLInputSource inC = new XMLInputSource(urlC);
+ ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC);
+ AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
+ aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
+ aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
+ aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_INDUCING_TAGS, new String[] { "child1",
+ "child2", "child3" });
+ aeC.setConfigParameterValue(HtmlConverter.PARAM_GAP_TEXT, "$");
+ aeC.reconfigure();
+
+ CAS cas = aeA.newCAS();
+ Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
+ AnnotationIndex<AnnotationFS> ai = null;
+ FSIterator<AnnotationFS> iterator = null;
+
+ cas.setDocumentText(html);
+ aeA.process(cas);
+ aeC.process(cas);
+
+ CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
+
+ assertEquals("$Some content$$More content.", plainTextCas.getDocumentText());
+
+ ai = plainTextCas.getAnnotationIndex(tagType);
+ iterator = ai.iterator();
+ assertEquals(4, ai.size());
+ assertEquals("$Some content$$More content.", iterator.next().getCoveredText());
+ assertEquals("$Some content", iterator.next().getCoveredText());
+ assertEquals("$", iterator.next().getCoveredText());
+ assertEquals("$More content.", iterator.next().getCoveredText());
+
+ cas.release();
+ }
+
+ @Test
+ public void testExpandOffsets() throws Exception {
+ String html = "<Parent>\n";
+ html += "<Child1>Some content</Child1>\n";
+ html += "<Child2 attribute=“someValue” />\n";
+ html += "<Child3>More content.</Child3>\n";
+ html += "</Parent>\n";
+
+ URL urlA = HtmlAnnotator.class.getClassLoader().getResource("HtmlAnnotator.xml");
+ if (urlA == null) {
+ urlA = HtmlAnnotator.class.getClassLoader().getResource(
+ "org/apache/uima/ruta/engine/HtmlAnnotator.xml");
+ }
+
+ URL urlC = HtmlAnnotator.class.getClassLoader().getResource("HtmlConverter.xml");
+ if (urlC == null) {
+ urlC = HtmlAnnotator.class.getClassLoader().getResource(
+ "org/apache/uima/ruta/engine/HtmlConverter.xml");
+ }
+
+ XMLInputSource inA = new XMLInputSource(urlA);
+ ResourceSpecifier specifierA = UIMAFramework.getXMLParser().parseResourceSpecifier(inA);
+ AnalysisEngine aeA = UIMAFramework.produceAnalysisEngine(specifierA);
+ aeA.setConfigParameterValue(HtmlAnnotator.PARAM_ONLY_CONTENT, false);
+ aeA.reconfigure();
+
+ XMLInputSource inC = new XMLInputSource(urlC);
+ ResourceSpecifier specifierC = UIMAFramework.getXMLParser().parseResourceSpecifier(inC);
+ AnalysisEngine aeC = UIMAFramework.produceAnalysisEngine(specifierC);
+ aeC.setConfigParameterValue(HtmlConverter.PARAM_SKIP_WHITESPACES, false);
+ aeC.setConfigParameterValue(HtmlConverter.PARAM_PROCESS_ALL, true);
+ aeC.setConfigParameterValue(HtmlConverter.PARAM_EXPAND_OFFSETS, true);
+ aeC.reconfigure();
+
+ CAS cas = aeA.newCAS();
+ Type tagType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "TAG");
+ Feature expandedFeature = tagType.getFeatureByBaseName("expandedOffsets");
+ AnnotationIndex<AnnotationFS> ai = null;
+ FSIterator<AnnotationFS> iterator = null;
+
+ cas.setDocumentText(html);
+ aeA.process(cas);
+ aeC.process(cas);
+
+ CAS plainTextCas = cas.getView(HtmlConverter.DEFAULT_MODIFIED_VIEW);
+
+ assertEquals("Some contentMore content.", plainTextCas.getDocumentText());
+
+ ai = plainTextCas.getAnnotationIndex(tagType);
+ iterator = ai.iterator();
+ assertEquals(4, ai.size());
+ AnnotationFS next = null ;
+ next = iterator.next();
+ assertEquals(false, next.getBooleanValue(expandedFeature));
+ assertEquals("Some contentMore content.", next.getCoveredText());
+ next = iterator.next();
+ assertEquals(false, next.getBooleanValue(expandedFeature));
+ assertEquals("Some content", next.getCoveredText());
+ next = iterator.next();
+ assertEquals(false, next.getBooleanValue(expandedFeature));
+ assertEquals("More content.", next.getCoveredText());
+ next = iterator.next();
+ assertEquals(true, next.getBooleanValue(expandedFeature));
+ assertEquals("More content.", next.getCoveredText());
+
+ cas.release();
+ }
+}