You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2013/02/22 11:12:29 UTC

svn commit: r1448973 - in /uima/sandbox/textmarker/trunk: textmarker-core/src/main/java/org/apache/uima/textmarker/engine/ textmarker-core/src/main/resources/org/apache/uima/textmarker/engine/ textmarker-core/src/test/java/org/apache/uima/textmarker/ t...

Author: pkluegl
Date: Fri Feb 22 10:12:28 2013
New Revision: 1448973

URL: http://svn.apache.org/r1448973
Log:
UIMA-2536
- applied martin's patch with some smaller modifications

Added:
    uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverter.java
    uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpan.java
    uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpanReplacement.java
    uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterVisitor.java
    uima/sandbox/textmarker/trunk/textmarker-core/src/main/resources/org/apache/uima/textmarker/engine/HtmlConverter.xml
    uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/engine/HtmlConverterTest.java
    uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecoding.html
    uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecodingAnnotations.html
    uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testUnix.html
    uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWin.html
    uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithComments.html
    uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithCommentsAndScript.html
Modified:
    uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/AllTests.java
    uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml
    uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/java/org/apache/uima/textmarker/ide/ui/wizards/TextMarkerProjectCreationWizard.java

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverter.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverter.java?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverter.java (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverter.java Fri Feb 22 10:12:28 2013
@@ -0,0 +1,405 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.engine;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringEscapeUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.CasCopier;
+import org.apache.uima.util.Level;
+import org.htmlparser.Parser;
+import org.htmlparser.util.NodeList;
+import org.htmlparser.util.ParserException;
+
+/**
+ * <p>
+ * This is a basic html/xml to text converter that maintains annotations. <br />
+ * Note that it is recommended to preprocess/prettify the html <i>before</i> any annotations are
+ * added to the document.
+ * </p>
+ * <p>
+ * how to handle: <br />
+ * - TODO tables? <br />
+ * - TODO lists (ul, ol) <br />
+ * - TODO ... <br />
+ * </p>
+ * 
+ */
+public class HtmlConverter extends JCasAnnotator_ImplBase {
+  // Names of the configuration parameters, plus two shared string constants
+  // (NAMESPACE and LINEBREAK are not parameter names).
+
+  // Package prefix of the html types; not referenced by the code of this class.
+  public static final String NAMESPACE = "org.apache.uima.textmarker.type.html.";
+
+  // Parameter: name of the view that receives the converted text. Blank values fall back to
+  // DEFAULT_MODIFIED_VIEW.
+  public static final String OUTPUT_VIEW = "outputView";
+
+  // Parameter: name of the view that is read. If blank, the CAS handed to process() is used.
+  public static final String INPUT_VIEW = "inputView";
+
+  // Parameter (Boolean, defaults to true): remove the line breaks contained in the text nodes.
+  public static final String REPLACE_LINEBREAKS = "replaceLinebreaks";
+
+  // Text that HtmlConverterVisitor emits for each newline inducing tag.
+  public static final String LINEBREAK = "\n";
+
+  // Parameter (String[]): names of the tags that are converted to a line break.
+  public static final String NEWLINE_INDUCING_TAGS = "newlineInducingTags";
+
+  // Parameter: "heuristic" (default), "explicit" or "none", see StringConversionPolicy.
+  public static final String CONVERSION_POLICY = "conversionPolicy";
+
+  // Parameter (String[]): patterns replaced by the explicit policy; compiled as regular
+  // expressions in handleConversion().
+  public static final String CONVERSION_PATTERNS = "conversionPatterns";
+
+  // Parameter (String[]): replacements for the patterns, matched by array index. Defaults to the
+  // html-unescaped form of each pattern.
+  public static final String CONVERSION_REPLACEMENTS = "conversionReplacements";
+
+  // default values:
+  private static final String DEFAULT_MODIFIED_VIEW = "plaintext";
+
+  // variables:
+  // name of the input view, or null if the CAS passed to process() is used directly
+  private String inputViewName;
+
+  // name of the output view
+  private String modifiedViewName;
+
+  // tag names for which the visitor creates line breaks
+  private Set<String> newlineInducingTags;
+
+  // patterns and their replacements (same index) for the explicit conversion policy
+  private String[] conversionPatterns;
+
+  private String[] conversionReplacements;
+
+  // whether line breaks within the text nodes are removed
+  private Boolean replaceLinebreaks;
+
+  // How html entities in the visible text are converted: HEURISTIC unescapes everything that
+  // looks like an entity, EXPLICIT applies the configured patterns, NONE leaves the text as is.
+  enum StringConversionPolicy {
+    HEURISTIC, EXPLICIT, NONE
+  }
+
+  private StringConversionPolicy conversionPolicy;
+
+  // Offset map rebuilt by each process() call: map[i] is the output offset of input offset i;
+  // it has one extra slot for the document end.
+  private int[] map;
+
+  /**
+   * Reads all configuration parameters and falls back to the built-in defaults for unset values.
+   * <p>
+   * Defaults: output view "plaintext", replaceLinebreaks {@code true}, conversion policy
+   * heuristic, a fixed list of block-level tags as newline inducing tags, and a fixed list of
+   * common html entities as conversion patterns whose replacements are their unescaped form.
+   * </p>
+   * 
+   * @param aContext
+   *          the context providing the configuration parameter values
+   * @throws ResourceInitializationException
+   *           if the conversion policy is unknown, if input and output view have the same name,
+   *           or if the explicit policy is configured with fewer replacements than patterns
+   */
+  @Override
+  public void initialize(UimaContext aContext) throws ResourceInitializationException {
+    super.initialize(aContext);
+    inputViewName = (String) aContext.getConfigParameterValue(INPUT_VIEW);
+    inputViewName = StringUtils.isBlank(inputViewName) ? null : inputViewName;
+    modifiedViewName = (String) aContext.getConfigParameterValue(OUTPUT_VIEW);
+    modifiedViewName = StringUtils.isBlank(modifiedViewName) ? DEFAULT_MODIFIED_VIEW
+            : modifiedViewName;
+    replaceLinebreaks = (Boolean) aContext.getConfigParameterValue(REPLACE_LINEBREAKS);
+    replaceLinebreaks = replaceLinebreaks == null ? true : replaceLinebreaks;
+
+    // equalsIgnoreCase does not depend on the default locale, unlike toLowerCase()
+    String conversionPolicyString = (String) aContext.getConfigParameterValue(CONVERSION_POLICY);
+    if (StringUtils.isBlank(conversionPolicyString)
+            || "heuristic".equalsIgnoreCase(conversionPolicyString)) {
+      conversionPolicy = StringConversionPolicy.HEURISTIC;
+    } else if ("explicit".equalsIgnoreCase(conversionPolicyString)) {
+      conversionPolicy = StringConversionPolicy.EXPLICIT;
+    } else if ("none".equalsIgnoreCase(conversionPolicyString)) {
+      conversionPolicy = StringConversionPolicy.NONE;
+    } else {
+      throw new ResourceInitializationException("illegal conversionPolicy parameter value",
+              new Object[0]);
+    }
+
+    newlineInducingTags = new HashSet<String>();
+    String[] nlTags = (String[]) aContext.getConfigParameterValue(NEWLINE_INDUCING_TAGS);
+    if (nlTags == null || nlTags.length == 0) {
+      nlTags = new String[] { "br", "p", "div", "ul", "ol", "dl", "li", "h1", "h2", "h3", "h4",
+          "h5", "h6", "blockquote" };
+    }
+    for (String nlTag : nlTags) {
+      newlineInducingTags.add(nlTag);
+    }
+
+    // check assertions; this has to hold no matter how the other parameters are configured
+    if (modifiedViewName.equals(inputViewName)) {
+      throw new ResourceInitializationException("input and output view names must differ!",
+              new Object[0]);
+    }
+
+    conversionPatterns = (String[]) aContext.getConfigParameterValue(CONVERSION_PATTERNS);
+    if (conversionPatterns == null) {
+      conversionPatterns = new String[] { "&nbsp;", "&laquo;", "&raquo;", "&quot;", "&amp;",
+          "&lt;", "&gt;", "&apos;", "&sect;", "&uml;", "&copy;", "&trade;", "&reg;", "&ouml;",
+          "&auml;", "&uuml;", "&#160;" };
+    }
+    conversionReplacements = (String[]) aContext.getConfigParameterValue(CONVERSION_REPLACEMENTS);
+    if (conversionReplacements == null) {
+      conversionReplacements = new String[conversionPatterns.length];
+      for (int i = 0; i < conversionPatterns.length; i++) {
+        conversionReplacements[i] = StringEscapeUtils.unescapeHtml4(conversionPatterns[i]);
+      }
+    }
+    // the explicit policy reads one replacement per pattern; fail here with a clear message
+    // instead of with an ArrayIndexOutOfBoundsException while processing a document
+    if (conversionPolicy == StringConversionPolicy.EXPLICIT
+            && conversionReplacements.length < conversionPatterns.length) {
+      throw new ResourceInitializationException(
+              "conversionReplacements must contain a replacement for each conversionPattern",
+              new Object[0]);
+    }
+  }
+
+  /**
+   * Converts the html/xml text of the input view to plain text, sets it as document text of the
+   * output view and copies the annotations of the input view to the output view with adapted
+   * offsets.
+   * <p>
+   * If the output view cannot be obtained or created, or if the annotations cannot be copied, the
+   * problem is logged and the method returns without throwing.
+   * </p>
+   * 
+   * @param jcaz
+   *          the CAS to process; if an input view is configured, that view of it is read
+   * @throws AnalysisEngineProcessException
+   *           if the input view cannot be accessed or the document text cannot be parsed
+   */
+  @Override
+  public void process(JCas jcaz) throws AnalysisEngineProcessException {
+    JCas jcas;
+    try {
+      if (inputViewName != null) {
+        jcas = jcaz.getView(inputViewName);
+      } else {
+        jcas = jcaz;
+      }
+    } catch (CASException e1) {
+      // wrap the exception itself: its cause may be null, which would hide the actual failure
+      throw new AnalysisEngineProcessException(e1);
+    }
+    // init:
+    String documentText = jcas.getDocumentText();
+    String splitSeq = documentText.contains("\r\n") ? "\r\n" : "\n";
+    map = new int[documentText.length() + 1];
+    JCas modview = null;
+    try {
+      // check if view already exists:
+      Iterator<JCas> viewIterator = jcas.getViewIterator();
+      while (viewIterator.hasNext()) {
+        JCas candidate = viewIterator.next();
+        if (candidate.getViewName().equals(modifiedViewName)) {
+          modview = candidate;
+          getContext().getLogger().log(Level.WARNING,
+                  "view with name \"" + modifiedViewName + "\" already exists.");
+          break;
+        }
+      }
+      if (modview == null) {
+        modview = jcas.createView(modifiedViewName);
+      }
+    } catch (CASException e) {
+      getContext().getLogger().log(Level.SEVERE,
+              "unable to access or create view \"" + modifiedViewName + "\"", e);
+      return;
+    }
+
+    // process
+    SortedSet<HtmlConverterPSpan> visibleSpansSoFar;
+    SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags;
+    try {
+      Parser parser = new Parser(documentText);
+      NodeList list = parser.parse(null);
+      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags);
+      list.visitAllNodesWith(visitor);
+      visibleSpansSoFar = visitor.getTextSpans();
+      linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
+    } catch (ParserException e) {
+      throw new AnalysisEngineProcessException(e);
+    }
+    if (replaceLinebreaks) {
+      visibleSpansSoFar = this.handleLinebreaksInDocumentText(visibleSpansSoFar, splitSeq);
+    }
+    if (conversionPolicy == StringConversionPolicy.HEURISTIC) {
+      visibleSpansSoFar = this.htmlDecoding(visibleSpansSoFar);
+    } else if (conversionPolicy == StringConversionPolicy.EXPLICIT) {
+      for (int i = 0; i < conversionPatterns.length; i++) {
+        String pat = conversionPatterns[i];
+        String rep = conversionReplacements[i];
+        visibleSpansSoFar = this.handleConversion(visibleSpansSoFar, pat, rep);
+      }
+    }
+    visibleSpansSoFar.addAll(linebreaksFromHtmlTags);
+
+    // create new doc-text and the map from deletions and visible-text-spans:
+    StringBuilder sbu = new StringBuilder(documentText.length());
+    int originalOffsetI = 0;
+    int outOffset = 0;
+    for (HtmlConverterPSpan vis : visibleSpansSoFar) {
+      final int begin = vis.getBegin();
+      final int end = vis.getEnd();
+
+      // map the skipped text before the span onto the current output position:
+      while (originalOffsetI < begin) {
+        map[originalOffsetI++] = outOffset;
+      }
+
+      // get and map text/replacement:
+      final String s;
+      if (vis instanceof HtmlConverterPSpanReplacement) {
+        // conversion/replacement: the leading source characters are mapped one-to-one onto the
+        // replacement. A configured replacement may be longer than the replaced source, so the
+        // loop is bounded by 'end' as well in order to stay within the span and within the map.
+        s = vis.getTxt();
+        int mapped = 0;
+        while (originalOffsetI < end && mapped < s.length()) {
+          map[originalOffsetI++] = outOffset + mapped;
+          mapped++;
+        }
+        // keep the output offset in sync with the text that is appended below
+        outOffset += s.length();
+        // remaining source characters of the span collapse onto the position after the replacement
+        while (originalOffsetI < end) {
+          map[originalOffsetI++] = outOffset;
+        }
+      } else {
+        // simple annotation: the text is copied unchanged, so offsets map one-to-one
+        s = documentText.substring(begin, end);
+        while (originalOffsetI < end) {
+          map[originalOffsetI++] = outOffset++;
+        }
+      }
+      sbu.append(s);
+    }
+    while (originalOffsetI < documentText.length()) {
+      map[originalOffsetI++] = outOffset;
+    }
+    // NOTE(review): this maps the document end one position behind the end of the converted
+    // text; presumably intended so that annotations ending at the document end do not collapse to
+    // an empty span - confirm that the resulting end offset is acceptable for the output view
+    map[documentText.length()] = outOffset + 1; // handle doc end separately
+    String modTxt = sbu.toString();
+    modview.setDocumentText(modTxt);
+
+    // copy annotations using the 'map':
+    try {
+      mapAnnotations(jcas, map, modifiedViewName);
+    } catch (CASException e) {
+      getContext().getLogger().log(Level.SEVERE,
+              "unable to copy the annotations to view \"" + modifiedViewName + "\"", e);
+    }
+  }
+
+  /**
+   * Copies the annotations of the given CAS to the target view and translates their offsets with
+   * the given map. DocumentAnnotations are skipped, and annotations whose mapped span is empty
+   * are not added to the target view.
+   * 
+   * @param fromJcas
+   *          the CAS whose annotation index is copied
+   * @param map
+   *          offset map, indexed by source offset and containing the target offset
+   * @param toView
+   *          name of the target view
+   * @throws CASException
+   *           if the target view cannot be accessed
+   */
+  private void mapAnnotations(JCas fromJcas, int[] map, String toView) throws CASException {
+    JCas modview = fromJcas.getView(toView);
+
+    AnnotationIndex<Annotation> annotationIndex = fromJcas.getAnnotationIndex();
+    // NOTE(review): the upper bound is the end of the source document, not the length of the
+    // converted text - confirm that this is the intended limit
+    final int sourceDocumentEnd = fromJcas.getCas().getDocumentAnnotation().getEnd();
+
+    CasCopier casCopier = new CasCopier(fromJcas.getCas(), modview.getCas());
+    for (Annotation annotation : annotationIndex) {
+      // TODO be careful here, because some people inherit from DocumentAnnotation
+      if (annotation instanceof DocumentAnnotation) {
+        continue;
+      }
+      Annotation clone = (Annotation) casCopier.copyFs(annotation);
+      // change the view/sofa of the new annotation...
+      // see: http://osdir.com/ml/apache.uima.general/2007-09/msg00107.html
+      clone.setFeatureValue(modview.getTypeSystem()
+              .getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA), modview.getSofa());
+      final int mappedBegin = map[clone.getBegin()];
+      final int mappedEnd = map[clone.getEnd()];
+      // annotations that only covered removed text end up empty and are dropped
+      if (mappedBegin < mappedEnd) {
+        if (mappedEnd > sourceDocumentEnd) {
+          getContext().getLogger().log(Level.WARNING, "illegal annotation offset mapping");
+        } else {
+          clone.setBegin(mappedBegin);
+          clone.setEnd(mappedEnd);
+          // TODO handle nested annotation features
+          modview.addFsToIndexes(clone);
+        }
+      }
+    }
+  }
+
+  private SortedSet<HtmlConverterPSpan> handleLinebreaksInDocumentText(
+          SortedSet<HtmlConverterPSpan> visibleSpansSoFar, String splitSeq) {
+    return this.handleConversion(visibleSpansSoFar, splitSeq, "");
+  }
+
+  private SortedSet<HtmlConverterPSpan> htmlDecoding(SortedSet<HtmlConverterPSpan> visibleSpansSoFar) {
+    TreeSet<HtmlConverterPSpan> copy = new TreeSet<HtmlConverterPSpan>(visibleSpansSoFar);
+
+    Pattern patt = Pattern.compile("(&[a-zA-Z]{2,5};)|(&#\\d{2,5};)");
+
+    for (HtmlConverterPSpan pSpan : visibleSpansSoFar) {
+      String spanTxt = pSpan.getTxt();
+      Matcher matcher = patt.matcher(spanTxt);
+
+      if (matcher.find()) {
+        copy.remove(pSpan);
+        int pSpanBegin = pSpan.getBegin();
+        int ioff = pSpan.getBegin();
+        do {
+          String sourceString = matcher.group();
+          String replacement = StringEscapeUtils.unescapeHtml4(sourceString);
+          HtmlConverterPSpanReplacement replacementSpan = new HtmlConverterPSpanReplacement(
+                  pSpanBegin + matcher.start(), pSpanBegin + matcher.end(), replacement);
+          copy.add(replacementSpan);
+
+          int replacementLength = sourceString.length();
+          if (pSpanBegin + matcher.end() > ioff + replacementLength) {
+            int ib = ioff;
+            int ie = pSpanBegin + matcher.start();
+            String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin);
+            copy.add(new HtmlConverterPSpan(ib, ie, newTxt));
+            ioff = ie;
+          }
+          ioff += replacementLength; //
+        } while (matcher.find());
+        if (ioff < pSpan.getEnd()) {
+          int ib = ioff;
+          int ie = pSpan.getEnd();
+          String newTxt = spanTxt.substring(ib - pSpanBegin, ie - pSpanBegin);
+          copy.add(new HtmlConverterPSpan(ioff, pSpan.getEnd(), newTxt));
+        }
+      }
+    }
+    return copy;
+  }
+
+  /**
+   * Replaces each match of the given pattern within the spans. Each affected span is split into
+   * replacement spans for the matches and plain spans for the text in between. If the replacement
+   * is empty, no replacement span is created and the matched text is dropped.
+   * <p>
+   * The pattern is compiled as a regular expression, so the offsets are taken from the actual
+   * match and not from the length of the pattern string, which differs for anything but a plain
+   * literal.
+   * </p>
+   * 
+   * @param visibleSpansSoFar
+   *          the spans to convert; this set is not modified
+   * @param patternString
+   *          regular expression of the text to replace
+   * @param replacement
+   *          the replacement text, may be empty
+   * @return a new set containing the untouched spans and the parts of the split spans
+   */
+  private SortedSet<HtmlConverterPSpan> handleConversion(
+          SortedSet<HtmlConverterPSpan> visibleSpansSoFar, String patternString, String replacement) {
+    TreeSet<HtmlConverterPSpan> copy = new TreeSet<HtmlConverterPSpan>(visibleSpansSoFar);
+
+    Pattern patt = Pattern.compile(patternString);
+
+    for (HtmlConverterPSpan pSpan : visibleSpansSoFar) {
+      String spanTxt = pSpan.getTxt();
+      Matcher matcher = patt.matcher(spanTxt);
+
+      if (matcher.find()) {
+        copy.remove(pSpan);
+        int pSpanBegin = pSpan.getBegin();
+        // document offset up to which the span has been covered by new spans
+        int ioff = pSpanBegin;
+        do {
+          int matchBegin = pSpanBegin + matcher.start();
+          int matchEnd = pSpanBegin + matcher.end();
+          if (!StringUtils.isEmpty(replacement)) {
+            HtmlConverterPSpanReplacement replacementSpan = new HtmlConverterPSpanReplacement(
+                    matchBegin, matchEnd, replacement);
+            copy.add(replacementSpan);
+          }
+          if (matchBegin > ioff) {
+            // plain text between the previous match (or the span begin) and this match
+            String newTxt = spanTxt.substring(ioff - pSpanBegin, matchBegin - pSpanBegin);
+            copy.add(new HtmlConverterPSpan(ioff, matchBegin, newTxt));
+          }
+          // continue behind the text that was actually matched
+          ioff = matchEnd;
+        } while (matcher.find());
+        if (ioff < pSpan.getEnd()) {
+          // plain text behind the last match
+          int ie = pSpan.getEnd();
+          String newTxt = spanTxt.substring(ioff - pSpanBegin, ie - pSpanBegin);
+          copy.add(new HtmlConverterPSpan(ioff, ie, newTxt));
+        }
+      }
+    }
+    return copy;
+  }
+
+}

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpan.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpan.java?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpan.java (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpan.java Fri Feb 22 10:12:28 2013
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.engine;
+
+/**
+ * A span of the source document given by its begin and end offset, together with the text that
+ * belongs to the span. Spans are ordered by begin offset and then by end offset; the text takes
+ * no part in comparison, equality or hash code.
+ */
+public class HtmlConverterPSpan implements Comparable<HtmlConverterPSpan> {
+  private final int begin;
+
+  private final int end;
+
+  private final String txt;
+
+  /**
+   * @param begin
+   *          begin offset of the span
+   * @param end
+   *          end offset of the span
+   * @param txt
+   *          the text that belongs to the span
+   */
+  public HtmlConverterPSpan(int begin, int end, String txt) {
+    super();
+    this.begin = begin;
+    this.end = end;
+    this.txt = txt;
+  }
+
+  public String getTxt() {
+    return txt;
+  }
+
+  /**
+   * @return the distance between end and begin offset
+   */
+  public int getLength() {
+    return this.end - this.begin;
+  }
+
+  public int getBegin() {
+    return begin;
+  }
+
+  public int getEnd() {
+    return end;
+  }
+
+  /**
+   * Orders spans by begin offset; spans with the same begin offset are ordered by end offset.
+   * Both criteria compare like with like (begin with begin, end with end), so the ordering is
+   * antisymmetric and transitive as sorted collections require, and it is consistent with
+   * {@link #equals(Object)}.
+   */
+  public int compareTo(HtmlConverterPSpan o) {
+    if (this.begin != o.begin) {
+      return this.begin < o.begin ? -1 : +1;
+    }
+    if (this.end != o.end) {
+      return this.end < o.end ? -1 : +1;
+    }
+    return 0;
+  }
+
+  @Override
+  public int hashCode() {
+    final int prime = 31;
+    int result = 1;
+    result = prime * result + begin;
+    result = prime * result + end;
+    return result;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj)
+      return true;
+    if (obj == null)
+      return false;
+    if (getClass() != obj.getClass())
+      return false;
+    HtmlConverterPSpan other = (HtmlConverterPSpan) obj;
+    if (begin != other.begin)
+      return false;
+    if (end != other.end)
+      return false;
+    return true;
+  }
+
+  /**
+   * @return the offsets and at most the first ten characters of the text
+   */
+  @Override
+  public String toString() {
+    // a span without text must not break logging or debugging output
+    String text = this.txt == null ? "" : this.txt;
+    String shortTxt = text.length() > 10 ? text.substring(0, 10) + "..." : text;
+    return String.format("[%d-%d : %s]", begin, end, shortTxt);
+  }
+}
\ No newline at end of file

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpanReplacement.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpanReplacement.java?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpanReplacement.java (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterPSpanReplacement.java Fri Feb 22 10:12:28 2013
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.engine;
+
+/**
+ * A span whose text replaces the source text between its offsets. The class adds no state or
+ * behavior of its own; HtmlConverter distinguishes it from plain spans by its type and then
+ * emits the span's text instead of the covered source text.
+ */
+public class HtmlConverterPSpanReplacement extends HtmlConverterPSpan {
+
+  /**
+   * @param begin
+   *          begin offset of the replaced source text
+   * @param end
+   *          end offset of the replaced source text
+   * @param txt
+   *          the replacement text
+   */
+  public HtmlConverterPSpanReplacement(int begin, int end, String txt) {
+    super(begin, end, txt);
+  }
+}
\ No newline at end of file

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterVisitor.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterVisitor.java?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterVisitor.java (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/main/java/org/apache/uima/textmarker/engine/HtmlConverterVisitor.java Fri Feb 22 10:12:28 2013
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.engine;
+
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.commons.lang3.StringUtils;
+import org.htmlparser.Tag;
+import org.htmlparser.Text;
+import org.htmlparser.tags.ScriptTag;
+import org.htmlparser.visitors.TextExtractingVisitor;
+
+/**
+ * TODO comment / describe <br>
+ * TODO write test(s)
+ * 
+ */
+/**
+ * Visitor that collects the spans of the text nodes which are located within the body but not
+ * within a script, and that creates a line break replacement span at the start position of each
+ * newline inducing tag. Text nodes that consist of whitespace only are skipped.
+ */
+public class HtmlConverterVisitor extends TextExtractingVisitor {
+
+  // set while the visitor is between the body start tag and the body end tag
+  private boolean inBody = false;
+
+  // set while the visitor is between a script start tag and its end tag
+  private boolean inScript = false;
+
+  // blank text nodes are ignored while this is set; it is never changed in this class
+  private boolean skipWhitespace = true;
+
+  private SortedSet<HtmlConverterPSpan> textSpans = new TreeSet<HtmlConverterPSpan>();
+
+  private SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
+
+  private Set<String> newlineInducingTags;
+
+  /**
+   * @param newlineInducingTags
+   *          lower case names of the tags for which a line break is created
+   */
+  public HtmlConverterVisitor(Set<String> newlineInducingTags) {
+    this.newlineInducingTags = newlineInducingTags;
+  }
+
+  /**
+   * Stores the span of the given text node if it is visible text of the body.
+   */
+  @Override
+  public void visitStringNode(Text node) {
+    super.visitStringNode(node);
+    if (!this.inBody || this.inScript) {
+      return;
+    }
+    if (skipWhitespace && StringUtils.isBlank(node.getText())) {
+      return;
+    }
+    int spanBegin = node.getStartPosition();
+    int spanEnd = node.getEndPosition();
+    textSpans.add(new HtmlConverterPSpan(spanBegin, spanEnd, node.getText()));
+  }
+
+  /**
+   * Tracks the start of body and script, and stores a line break for newline inducing tags. The
+   * line break replaces the first character of the tag.
+   */
+  @Override
+  public void visitTag(Tag tag) {
+    super.visitTag(tag);
+    final String name = tag.getTagName().toLowerCase().trim();
+    if ("body".equals(name)) {
+      this.inBody = true;
+    } else if ("script".equals(name)) {
+      this.inScript = true;
+    }
+    if (!this.newlineInducingTags.contains(name)) {
+      return;
+    }
+    int tagBegin = tag.getStartPosition();
+    HtmlConverterPSpanReplacement linebreak = new HtmlConverterPSpanReplacement(tagBegin,
+            tagBegin + 1, HtmlConverter.LINEBREAK);
+    this.linebreaksFromHtmlTags.add(linebreak);
+  }
+
+  /**
+   * Tracks the end of body and script.
+   */
+  @Override
+  public void visitEndTag(Tag tag) {
+    final String name = tag.getTagName().toLowerCase().trim();
+    if ("body".equals(name)) {
+      this.inBody = false;
+      return;
+    }
+    if ("script".equals(name) || tag instanceof ScriptTag) {
+      this.inScript = false;
+    }
+  }
+
+  /**
+   * @return the spans of the visible text nodes collected so far
+   */
+  public SortedSet<HtmlConverterPSpan> getTextSpans() {
+    return textSpans;
+  }
+
+  /**
+   * @return the line break replacement spans collected so far
+   */
+  public SortedSet<HtmlConverterPSpan> getLinebreaksFromHtmlTags() {
+    return linebreaksFromHtmlTags;
+  }
+}
\ No newline at end of file

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/main/resources/org/apache/uima/textmarker/engine/HtmlConverter.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/main/resources/org/apache/uima/textmarker/engine/HtmlConverter.xml?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/main/resources/org/apache/uima/textmarker/engine/HtmlConverter.xml (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/main/resources/org/apache/uima/textmarker/engine/HtmlConverter.xml Fri Feb 22 10:12:28 2013
@@ -0,0 +1,93 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+  <primitive>true</primitive>
+  <annotatorImplementationName>org.apache.uima.textmarker.engine.HtmlConverter</annotatorImplementationName>
+  <analysisEngineMetaData>
+    <name>XMLConverter</name>
+    <description/>
+    <version>1.0</version>
+    <vendor/>
+    <configurationParameters>
+      <configurationParameter>
+        <name>outputView</name>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>newlineInducingTags</name>
+        <type>String</type>
+        <multiValued>true</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>replaceLinebreaks</name>
+        <type>Boolean</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>inputView</name>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>conversionPolicy</name>
+        <description>Either heuristic, explicit, or none.
+Defaults to heuristic.</description>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>conversionPatterns</name>
+        <type>String</type>
+        <multiValued>true</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+      <configurationParameter>
+        <name>conversionReplacements</name>
+        <type>String</type>
+        <multiValued>true</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+    </configurationParameters>
+    <configurationParameterSettings/>
+    <typeSystemDescription/>
+    <typePriorities/>
+    <fsIndexCollection/>
+    <capabilities>
+      <capability>
+        <inputs/>
+        <outputs/>
+        <languagesSupported/>
+      </capability>
+    </capabilities>
+    <operationalProperties>
+      <modifiesCas>true</modifiesCas>
+      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+      <outputsNewCASes>false</outputsNewCASes>
+    </operationalProperties>
+  </analysisEngineMetaData>
+  <resourceManagerConfiguration/>
+</analysisEngineDescription>
\ No newline at end of file

Modified: uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/AllTests.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/AllTests.java?rev=1448973&r1=1448972&r2=1448973&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/AllTests.java (original)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/AllTests.java Fri Feb 22 10:12:28 2013
@@ -23,6 +23,7 @@ import org.apache.uima.textmarker.condit
 import org.apache.uima.textmarker.condition.PartOfTest;
 import org.apache.uima.textmarker.condition.PositionTest;
 import org.apache.uima.textmarker.engine.HtmlAnnotatorTest;
+import org.apache.uima.textmarker.engine.HtmlConverterTest;
 import org.apache.uima.textmarker.engine.TextMarkerModifierTest;
 import org.apache.uima.textmarker.seed.DefaultSeederTest;
 import org.apache.uima.textmarker.verbalizer.ActionVerbalizerTest;
@@ -40,7 +41,7 @@ import org.junit.runners.Suite.SuiteClas
     AllActionsTest.class, AllConditionsTest.class, CurrentCountTest.class, PartOfTest.class,
     PositionTest.class, DefaultSeederTest.class, ConditionVerbalizerTest.class,
     ActionVerbalizerTest.class, ExpressionVerbalizerTest.class, HtmlAnnotatorTest.class,
-    EmptyDocumentTest.class, TextMarkerModifierTest.class })
+    HtmlConverterTest.class, EmptyDocumentTest.class, TextMarkerModifierTest.class })
 public class AllTests {
 
 }

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/engine/HtmlConverterTest.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/engine/HtmlConverterTest.java?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/engine/HtmlConverterTest.java (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/test/java/org/apache/uima/textmarker/engine/HtmlConverterTest.java Fri Feb 22 10:12:28 2013
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.textmarker.engine;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.net.URL;
+
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.textmarker.TextMarkerTestUtils;
+import org.apache.uima.util.FileUtils;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests for the {@link HtmlConverter} analysis engine: conversion of HTML documents into a plain
+ * text output view, handling of HTML entities, and propagation of annotations to that view.
+ */
+public class HtmlConverterTest {
+
+  /** Name of the output view that every test configures the converter to write to. */
+  private static final String OUTPUT_VIEW_NAME = "raw_testing";
+
+  /** Converted text expected for the test documents that contain no HTML entities. */
+  private static final String EXPECTED_PLAIN_TEXT = "start of body\nnormal bold\nend of body";
+
+  private static String htmlWin;
+
+  private static String htmlWithComments;
+
+  private static String htmlWithCommentsAndScript;
+
+  private static String htmlUnix;
+
+  private static String htmlDecoding;
+
+  private static String htmlDecodingAnnotations;
+
+  /** Reads all HTML test documents from the classpath once before the tests run. */
+  @BeforeClass
+  public static void setUpClass() throws IOException, URISyntaxException {
+    htmlWin = readTestDocument("testWin.html");
+    htmlUnix = readTestDocument("testUnix.html");
+    htmlDecoding = readTestDocument("testHtmlDecoding.html");
+    htmlDecodingAnnotations = readTestDocument("testHtmlDecodingAnnotations.html");
+    htmlWithComments = readTestDocument("testWithComments.html");
+    htmlWithCommentsAndScript = readTestDocument("testWithCommentsAndScript.html");
+  }
+
+  /**
+   * With the explicit policy only the configured pattern ({@code &nbsp;}) is expected to be
+   * replaced; all other entities are expected to remain encoded in the output.
+   */
+  @Test
+  public void htmlBodyContentHtmlDecodingExplicitPolicyTest()
+          throws AnalysisEngineProcessException, ResourceConfigurationException,
+          ResourceInitializationException, InvalidXMLException, IOException {
+    AnalysisEngine ae = createEngine();
+    ae.setConfigParameterValue(HtmlConverter.CONVERSION_POLICY, "explicit");
+    ae.setConfigParameterValue(HtmlConverter.CONVERSION_PATTERNS, new String[] { "&nbsp;" });
+    ae.setConfigParameterValue(HtmlConverter.CONVERSION_REPLACEMENTS, new String[] { " " });
+
+    assertConvertedText(ae, null, htmlDecoding,
+            "start of body\nnormal normal bold\nend of body 3&#8364;&#160;&auml;&ouml;&uuml;");
+  }
+
+  /** With no conversion policy set, named and numeric entities are expected to be decoded. */
+  @Test
+  public void htmlBodyContentHtmlDecodingHeuristicPolicyTest()
+          throws AnalysisEngineProcessException, ResourceConfigurationException,
+          ResourceInitializationException, InvalidXMLException, IOException {
+    // non-ASCII characters are written as unicode escapes so that the expectation does not
+    // depend on the encoding used to compile this file
+    assertConvertedText(createEngine(), null, htmlDecoding,
+            "start of body\nnormal\u00A0normal bold\nend of body 3\u20AC\u00A0\u00E4\u00F6\u00FC");
+  }
+
+  /** Converts the content of testUnix.html without setting any conversion parameters. */
+  @Test
+  public void htmlBodyContentUnixTest() throws AnalysisEngineProcessException,
+          ResourceConfigurationException, ResourceInitializationException, InvalidXMLException,
+          IOException {
+    assertConvertedText(createEngine(), null, htmlUnix, EXPECTED_PLAIN_TEXT);
+  }
+
+  /** Converts the content of testWin.html without setting any conversion parameters. */
+  @Test
+  public void htmlBodyContentWinTest() throws AnalysisEngineProcessException,
+          ResourceConfigurationException, ResourceInitializationException, InvalidXMLException,
+          IOException {
+    assertConvertedText(createEngine(), null, htmlWin, EXPECTED_PLAIN_TEXT);
+  }
+
+  /** HTML comments, including a long multi-line one, are expected to be dropped. */
+  @Test
+  public void htmlBodyContentWithCommentsTest() throws AnalysisEngineProcessException,
+          ResourceConfigurationException, ResourceInitializationException, InvalidXMLException,
+          IOException {
+    assertConvertedText(createEngine(), null, htmlWithComments, EXPECTED_PLAIN_TEXT);
+  }
+
+  /** Setting the newline inducing tags to {@code br} is expected to yield the same text. */
+  @Test
+  public void htmlBodyContentNLTagsTest() throws AnalysisEngineProcessException,
+          ResourceConfigurationException, ResourceInitializationException, InvalidXMLException,
+          IOException {
+    AnalysisEngine ae = createEngine();
+    ae.setConfigParameterValue(HtmlConverter.NEWLINE_INDUCING_TAGS, new String[] { "br" });
+
+    assertConvertedText(ae, null, htmlWin, EXPECTED_PLAIN_TEXT);
+  }
+
+  /** Comments and the content of script elements are expected to be dropped. */
+  @Test
+  public void htmlBodyContentWithCommentsAndScriptTest() throws AnalysisEngineProcessException,
+          ResourceConfigurationException, ResourceInitializationException, InvalidXMLException,
+          IOException {
+    assertConvertedText(createEngine(), null, htmlWithCommentsAndScript, EXPECTED_PLAIN_TEXT);
+  }
+
+  /** An annotation on the bold element is expected to be moved onto the converted text. */
+  @Test
+  public void annotationPropagationTest() throws AnalysisEngineProcessException,
+          ResourceConfigurationException, ResourceInitializationException, InvalidXMLException,
+          IOException {
+    AnalysisEngine ae = createEngineWithHtmlTypes();
+    ae.reconfigure();
+
+    CAS cas = ae.newCAS();
+    try {
+      cas.reset();
+      cas.setDocumentText(htmlWin);
+      Type boldType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "B");
+      // offsets of "<b>bold</b>", counted with two-character (CRLF) line ends in testWin.html
+      cas.addFsToIndexes(cas.createAnnotation(boldType, 78, 89));
+
+      ae.process(cas);
+
+      FSIterator<AnnotationFS> iterator = outputAnnotations(cas, boldType);
+      assertTrue("bold annotation missing in output view", iterator.hasNext());
+      assertBoldAnnotation(iterator.next(), 21, 25);
+    } finally {
+      cas.release();
+    }
+  }
+
+  /** Annotations behind decoded entities are expected to get correctly shifted offsets. */
+  @Test
+  public void annotationPropagationAndDecodingTest() throws AnalysisEngineProcessException,
+          ResourceConfigurationException, ResourceInitializationException, InvalidXMLException,
+          IOException {
+    AnalysisEngine ae = createEngineWithHtmlTypes();
+    ae.reconfigure();
+
+    CAS cas = ae.newCAS();
+    try {
+      cas.reset();
+      cas.setDocumentText(htmlDecodingAnnotations);
+      Type boldType = cas.getTypeSystem().getType(HtmlAnnotator.NAMESPACE + "B");
+      // offsets of both "<b>bold</b>" elements, counted with two-character (CRLF) line ends
+      cas.addFsToIndexes(cas.createAnnotation(boldType, 210, 221));
+      cas.addFsToIndexes(cas.createAnnotation(boldType, 279, 290));
+
+      ae.process(cas);
+
+      // test: should be:
+      //
+      // start of body\nnormal normal bold\nend of body 3€ äöü and bold.
+      // 01234567890123 4567890123456789012 34567890123456789012345678901234567890
+      // 0 ------- 1 -------- 2 ------- 3 -------- 4 ------- 5 ------- 6 ------- 7
+      // ____________________________ <--> _______________________ <--> __________
+      // _____________________________ BB _________________________ BB ___________
+      //
+      FSIterator<AnnotationFS> iterator = outputAnnotations(cas, boldType);
+      assertTrue("first bold annotation missing in output view", iterator.hasNext());
+      assertBoldAnnotation(iterator.next(), 28, 32);
+      assertTrue("second bold annotation missing in output view", iterator.hasNext());
+      assertBoldAnnotation(iterator.next(), 56, 60); // map[279] == 56, map[290] == 60
+    } finally {
+      cas.release();
+    }
+  }
+
+  /** The document is expected to be read from the configured input view. */
+  @Test
+  public void parameterTestInputView() throws AnalysisEngineProcessException,
+          ResourceConfigurationException, ResourceInitializationException, InvalidXMLException,
+          IOException {
+    String inputViewName = "inview";
+    AnalysisEngine ae = createEngine();
+    ae.setConfigParameterValue(HtmlConverter.INPUT_VIEW, inputViewName);
+
+    assertConvertedText(ae, inputViewName, htmlUnix, EXPECTED_PLAIN_TEXT);
+  }
+
+  /**
+   * Reads a test document that is located in the package of this class on the classpath.
+   *
+   * @param fileName
+   *          simple name of the document, e.g. "testWin.html"
+   * @return the content of the document, decoded as UTF-8
+   * @throws IOException
+   *           if the document is not on the classpath or cannot be read
+   * @throws URISyntaxException
+   *           if the location of the document is not a valid URI
+   */
+  private static String readTestDocument(String fileName) throws IOException,
+          URISyntaxException {
+    String namespace = HtmlConverterTest.class.getPackage().getName().replace('.', '/');
+    String resourceName = namespace + "/" + fileName;
+    URL resourceUrl = HtmlConverterTest.class.getClassLoader().getResource(resourceName);
+    if (resourceUrl == null) {
+      throw new IOException("Test resource not found on classpath: " + resourceName);
+    }
+    return FileUtils.file2String(new File(resourceUrl.toURI()), "UTF-8");
+  }
+
+  /**
+   * Parses the HtmlConverter descriptor, which is looked up at the classpath root first and in the
+   * engine package second.
+   *
+   * @return the parsed descriptor
+   * @throws InvalidXMLException
+   *           if the descriptor is not valid
+   * @throws IOException
+   *           if the descriptor is not on the classpath or cannot be read
+   */
+  private static ResourceSpecifier loadDescriptor() throws InvalidXMLException, IOException {
+    ClassLoader classLoader = HtmlConverter.class.getClassLoader();
+    URL url = classLoader.getResource("HtmlConverter.xml");
+    if (url == null) {
+      url = classLoader.getResource("org/apache/uima/textmarker/engine/HtmlConverter.xml");
+    }
+    if (url == null) {
+      throw new IOException("HtmlConverter.xml not found on classpath");
+    }
+    XMLInputSource in = new XMLInputSource(url);
+    try {
+      return UIMAFramework.getXMLParser().parseResourceSpecifier(in);
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Creates a converter from the unmodified descriptor.
+   *
+   * @return the engine, with the output view parameter already set
+   */
+  private static AnalysisEngine createEngine() throws InvalidXMLException, IOException,
+          ResourceInitializationException {
+    return createEngine(loadDescriptor());
+  }
+
+  /**
+   * Produces an engine and sets its output view parameter to {@link #OUTPUT_VIEW_NAME}. The
+   * caller may set further parameters and has to call {@code reconfigure()} (as
+   * {@link #assertConvertedText} does) before processing.
+   */
+  private static AnalysisEngine createEngine(ResourceSpecifier specifier)
+          throws ResourceInitializationException {
+    AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(specifier);
+    ae.setConfigParameterValue(HtmlConverter.OUTPUT_VIEW, OUTPUT_VIEW_NAME);
+    return ae;
+  }
+
+  /**
+   * Creates a converter whose type system additionally declares the annotation types A, I, B and
+   * U in the {@link HtmlAnnotator#NAMESPACE} namespace, so that tests can create such annotations.
+   */
+  private static AnalysisEngine createEngineWithHtmlTypes() throws InvalidXMLException,
+          IOException, ResourceInitializationException {
+    AnalysisEngineDescription description = (AnalysisEngineDescription) loadDescriptor();
+    TypeSystemDescription typeSystem = description.getAnalysisEngineMetaData().getTypeSystem();
+    for (String tag : new String[] { "A", "I", "B", "U" }) {
+      typeSystem.addType(HtmlAnnotator.NAMESPACE + tag, "Type for Testing",
+              "uima.tcas.Annotation");
+    }
+    description.getAnalysisEngineMetaData().setTypeSystem(typeSystem);
+    return createEngine(description);
+  }
+
+  /**
+   * Runs the converter on the given document and checks the text of the output view as well as
+   * the span of its document annotation.
+   *
+   * @param ae
+   *          the engine; it is reconfigured here so that previously set parameters take effect
+   * @param inputViewName
+   *          name of the view to create for the document, or null to use the initial view
+   * @param html
+   *          the document to convert
+   * @param expectedText
+   *          the text expected in the output view
+   */
+  private static void assertConvertedText(AnalysisEngine ae, String inputViewName, String html,
+          String expectedText) throws ResourceConfigurationException,
+          ResourceInitializationException, AnalysisEngineProcessException {
+    ae.reconfigure();
+    CAS cas = ae.newCAS();
+    try {
+      cas.reset();
+      CAS inputView = inputViewName == null ? cas : cas.createView(inputViewName);
+      inputView.setDocumentText(html);
+
+      ae.process(cas);
+
+      CAS outputView = cas.getView(OUTPUT_VIEW_NAME);
+      String text = outputView.getDocumentText();
+      AnnotationFS documentAnnotation = outputView.getDocumentAnnotation();
+      assertEquals("begin of document annotation", 0, documentAnnotation.getBegin());
+      assertEquals("end of document annotation", text.length(), documentAnnotation.getEnd());
+      assertEquals(expectedText, text);
+    } finally {
+      cas.release();
+    }
+  }
+
+  /** Returns an iterator over the annotations of the given type in the output view. */
+  private static FSIterator<AnnotationFS> outputAnnotations(CAS cas, Type type) {
+    AnnotationIndex<AnnotationFS> index = cas.getView(OUTPUT_VIEW_NAME).getAnnotationIndex(type);
+    return index.iterator();
+  }
+
+  /** Checks that the annotation is of type B and covers the text "bold" at the given offsets. */
+  private static void assertBoldAnnotation(AnnotationFS annotation, int expectedBegin,
+          int expectedEnd) {
+    assertEquals("B", annotation.getType().getShortName());
+    assertEquals(expectedBegin, annotation.getBegin());
+    assertEquals(expectedEnd, annotation.getEnd());
+    assertEquals("bold", annotation.getCoveredText());
+  }
+}

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecoding.html
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecoding.html?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecoding.html (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecoding.html Fri Feb 22 10:12:28 2013
@@ -0,0 +1,15 @@
+<html>
+<head>
+<!--in head-->
+</head>
+<body>
+start of <script type="text/javascript">function testfun () {
+  var Jetzt = new Date();
+</script>body
+<!--a comment in the body-->
+<br/>
+normal&nbsp;normal <b>bold</b>
+<br/>
+end of body 3&#8364;&#160;&auml;&ouml;&uuml;
+</body>
+</html>
\ No newline at end of file

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecodingAnnotations.html
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecodingAnnotations.html?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecodingAnnotations.html (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testHtmlDecodingAnnotations.html Fri Feb 22 10:12:28 2013
@@ -0,0 +1,15 @@
+<html>
+<head>
+<!--in head-->
+</head>
+<body>
+start of <script type="text/javascript">function testfun () {
+  var Jetzt = new Date();
+</script>body
+<!--a comment in the body-->
+<br/>
+normal&nbsp;normal <b>bold</b>
+<br/>
+end of body 3&#8364;&#160;&auml;&ouml;&uuml; and <b>bold</b>.
+</body>
+</html>
\ No newline at end of file

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testUnix.html
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testUnix.html?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testUnix.html (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testUnix.html Fri Feb 22 10:12:28 2013
@@ -0,0 +1,12 @@
+<html>
+<head>
+<!--in head-->
+</head>
+<body>
+start of body
+<br/>
+normal <b>bold</b>
+<br/>
+end of body
+</body>
+</html>
\ No newline at end of file

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWin.html
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWin.html?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWin.html (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWin.html Fri Feb 22 10:12:28 2013
@@ -0,0 +1,12 @@
+<html>
+<head>
+<!--in head-->
+</head>
+<body>
+start of body
+<br/>
+normal <b>bold</b>
+<br/>
+end of body
+</body>
+</html>
\ No newline at end of file

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithComments.html
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithComments.html?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithComments.html (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithComments.html Fri Feb 22 10:12:28 2013
@@ -0,0 +1,32 @@
+<html>
+<head>
+<!--in head-->
+</head>
+<body>
+start of body
+<!--a comment in the body
+very 
+very 
+
+
+looooooooooooooooooooooooooooong 
+
+looooooooooooooooooooooooooooong looooooooooooooooooooooooooooong 
+
+looooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooong
+
+looooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooong
+looooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooong
+looooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooong
+looooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooong
+looooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooong
+looooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooong
+looooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooonglooooooooooooooooooooooooooooong
+
+a very looooooooooooooooooooooooooooong comment-->
+<br/>
+normal <b>bold</b>
+<br/>
+end of body
+</body>
+</html>
\ No newline at end of file

Added: uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithCommentsAndScript.html
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithCommentsAndScript.html?rev=1448973&view=auto
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithCommentsAndScript.html (added)
+++ uima/sandbox/textmarker/trunk/textmarker-core/src/test/resources/org/apache/uima/textmarker/engine/testWithCommentsAndScript.html Fri Feb 22 10:12:28 2013
@@ -0,0 +1,15 @@
+<html>
+<head>
+<!--in head-->
+</head>
+<body>
+start of <script type="text/javascript">function testfun () {
+  var Jetzt = new Date();
+</script>body
+<!--a comment in the body-->
+<br/>
+normal <b>bold</b>
+<br/>
+end of body
+</body>
+</html>
\ No newline at end of file

Modified: uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml?rev=1448973&r1=1448972&r2=1448973&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml (original)
+++ uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml Fri Feb 22 10:12:28 2013
@@ -1062,7 +1062,7 @@ ae.process(cas);]]></programlisting>
       </section>
     </section>
     <section id="ugr.tools.tm.ae.html">
-      <title>HMTL Annotator</title>
+      <title>HTML Annotator</title>
       <para>
         This Analysis Engine provides support for HTML files by adding annotations for the HTML elements. Using the default values, the HTML Annotator creates annotations
         for each HTML element spanning the content of the element, whereas the most common elements are represented by own types. 
@@ -1084,7 +1084,84 @@ ae.process(cas);]]></programlisting>
         </section>
       </section>
     </section>
-    <section id="ugr.tools.tm.ae.stylemap">
+    <section id="ugr.tools.tm.ae.htmlconverter">
+      <title>HTML Converter</title>
+      <para>
+        This Analysis Engine is able to convert html content from a source view into a plain string representation stored in an output view. 
+        In particular, the Analysis Engine transfers annotations to the new view, taking into account the changed document text and the resulting annotation offsets. 
+        The copy process also sets features, however, features of type annotation are currently not supported. 
+        Note that if an annotation would have the same start and end positions in the new view, i.e., 
+        if it would be mapped to an annotation of length 0, it is not moved to the new view.
+        
+        The HTML Converter also supports heuristic and explicit conversion patterns which default to html4 decoding, 
+        e.g., "<![CDATA[&nbsp;]]>" or "<![CDATA[&lt;]]>". Concepts like tables or lists are not supported.
+        
+        Note that in general it is suggested to run an html cleaner before any further processing to avoid problems with malformed html. 
+        
+        A descriptor file for this Analysis Engine is located in the folder <quote>descriptor/utils</quote> of a TextMarker project.
+      </para>
+      <section id="ugr.tools.tm.ae.htmlconverter.parameter">
+        <title>Configuration Parameters</title>
+        <para>
+        </para>
+        <section id="ugr.tools.tm.ae.htmlconverter.parameter.outputView">
+          <title>outputView</title>
+          <para>
+            This string parameter specifies the name of the new view. 
+            The default value is <quote>plaintext</quote>.
+          </para>
+        </section>
+        <section id="ugr.tools.tm.ae.htmlconverter.parameter.inputView">
+          <title>inputView</title>
+          <para>
+            This string parameter can optionally be set to specify the name of the input view.
+          </para>
+        </section>
+        <section id="ugr.tools.tm.ae.htmlconverter.parameter.newlineInducingTags">
+          <title>newlineInducingTags</title>
+          <para>
+            This string array parameter sets the names of the html tags that create linebreaks in the output view.
+            The default is <quote>br, p, div, ul, ol, dl, li, h1, ..., h6, blockquote</quote>.
+          </para>
+        </section>
+        <section id="ugr.tools.tm.ae.htmlconverter.parameter.replaceLinebreaks">
+          <title>replaceLinebreaks</title>
+          <para>
+            This boolean parameter determines whether linebreaks inside the text nodes are removed (<quote>true</quote>) or kept (<quote>false</quote>).
+            The default value is <quote>true</quote>.
+          </para>
+        </section>
+        <section id="ugr.tools.tm.ae.htmlconverter.parameter.conversionPolicy">
+          <title>conversionPolicy</title>
+          <para>
+            This string parameter determines the conversion policy used, either "heuristic", "explicit", or "none". 
+            When the value is "explicit", the parameters <quote>conversionPatterns</quote> and optionally <quote>conversionReplacements</quote> are considered.
+            The "heuristic" conversion policy uses simple regular expressions to decode html4 entities such as "<![CDATA[&nbsp;]]>".
+            The default behaviour is "heuristic".
+          </para>
+        </section>
+        <section id="ugr.tools.tm.ae.htmlconverter.parameter.conversionPatterns">
+          <title>conversionPatterns</title>
+          <para>
+            This string array parameter can be used to apply custom conversions.
+            It defaults to a list of commonly used codes, e.g., <![CDATA[&nbsp;]]>, 
+            which are converted using html 4 entity unescaping. 
+            However, explicit conversion strings can also be passed via the parameter <quote>conversionReplacements</quote>.
+            Remember to enable explicit conversion via <quote>conversionPolicy</quote> first.
+          </para>
+        </section>
+        <section id="ugr.tools.tm.ae.htmlconverter.parameter.conversionReplacements">
+          <title>conversionReplacements</title>
+          <para>
+            This string array parameter corresponds to <quote>conversionPatterns</quote> 
+            such that <quote>conversionPatterns[i]</quote> will be replaced by <quote>conversionReplacements[i]</quote>; 
+            replacements should be shorter than the source pattern.
+            By default, the replacement strings are computed using Html4 decoding.
+            Remember to enable explicit conversion via <quote>conversionPolicy</quote> first.
+          </para>
+        </section>
+      </section>
+    </section><section id="ugr.tools.tm.ae.stylemap">
       <title>Style Map Creator</title>
       <para>
         This Analysis Engine can be utilized to create style map information, which is needed by the Modifier Analysis Engine in order to create
@@ -1134,5 +1211,6 @@ ae.process(cas);]]></programlisting>
         </section>
       </section>
     </section>
+    
   </section>
 </chapter>
\ No newline at end of file

Modified: uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/java/org/apache/uima/textmarker/ide/ui/wizards/TextMarkerProjectCreationWizard.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/java/org/apache/uima/textmarker/ide/ui/wizards/TextMarkerProjectCreationWizard.java?rev=1448973&r1=1448972&r2=1448973&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/java/org/apache/uima/textmarker/ide/ui/wizards/TextMarkerProjectCreationWizard.java (original)
+++ uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/java/org/apache/uima/textmarker/ide/ui/wizards/TextMarkerProjectCreationWizard.java Fri Feb 22 10:12:28 2013
@@ -153,6 +153,7 @@ public class TextMarkerProjectCreationWi
     copy(utilsDir, "PlainTextTypeSystem.xml");
     copy(utilsDir, "HtmlAnnotator.xml");
     copy(utilsDir, "HtmlTypeSystem.xml");
+    copy(utilsDir, "HtmlConverter.xml");
   }
 
   private void copy(File dir, String fileName) {