You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by mb...@apache.org on 2007/09/12 14:22:25 UTC

svn commit: r574911 - in /incubator/uima/sandbox/trunk/RegularExpressionAnnotator: build_documentation.xml docbook/ docbook/RegexAnnotatorUserGuide/ docbook/RegexAnnotatorUserGuide/images/ docbook/RegexAnnotatorUserGuide/regexAnnotatorUserGuide.xml

Author: mbaessler
Date: Wed Sep 12 05:22:24 2007
New Revision: 574911

URL: http://svn.apache.org/viewvc?rev=574911&view=rev
Log:
UIMA-555

update RegexAnnotator documentation

https://issues.apache.org/jira/browse/UIMA-555

Added:
    incubator/uima/sandbox/trunk/RegularExpressionAnnotator/build_documentation.xml
    incubator/uima/sandbox/trunk/RegularExpressionAnnotator/docbook/
    incubator/uima/sandbox/trunk/RegularExpressionAnnotator/docbook/RegexAnnotatorUserGuide/
    incubator/uima/sandbox/trunk/RegularExpressionAnnotator/docbook/RegexAnnotatorUserGuide/images/
    incubator/uima/sandbox/trunk/RegularExpressionAnnotator/docbook/RegexAnnotatorUserGuide/regexAnnotatorUserGuide.xml

Added: incubator/uima/sandbox/trunk/RegularExpressionAnnotator/build_documentation.xml
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/RegularExpressionAnnotator/build_documentation.xml?rev=574911&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/RegularExpressionAnnotator/build_documentation.xml (added)
+++ incubator/uima/sandbox/trunk/RegularExpressionAnnotator/build_documentation.xml Wed Sep 12 05:22:24 2007
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<!-- set the basedir value to point to the top level of the project -->
+
+<project name="Apache UIMA Sandbox Documentation" default="all" basedir=".">
+  
+  <property name="book_name" value="RegexAnnotatorUserGuide"/>
+	
+  <import file="${basedir}/../SandboxDocs/sandbox_build.xml"/>  
+  
+</project>

Added: incubator/uima/sandbox/trunk/RegularExpressionAnnotator/docbook/RegexAnnotatorUserGuide/regexAnnotatorUserGuide.xml
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/RegularExpressionAnnotator/docbook/RegexAnnotatorUserGuide/regexAnnotatorUserGuide.xml?rev=574911&view=auto
==============================================================================
--- incubator/uima/sandbox/trunk/RegularExpressionAnnotator/docbook/RegexAnnotatorUserGuide/regexAnnotatorUserGuide.xml (added)
+++ incubator/uima/sandbox/trunk/RegularExpressionAnnotator/docbook/RegexAnnotatorUserGuide/regexAnnotatorUserGuide.xml Wed Sep 12 05:22:24 2007
@@ -0,0 +1,1027 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+"http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd" [
+<!ENTITY imgroot "./images/" >
+<!ENTITY % xinclude SYSTEM "../../../uima-docbook-tool/xinclude.mod">
+  %xinclude;
+]>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one
+	or more contributor license agreements.  See the NOTICE file
+	distributed with this work for additional information
+	regarding copyright ownership.  The ASF licenses this file
+	to you under the Apache License, Version 2.0 (the
+	"License"); you may not use this file except in compliance
+	with the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing,
+	software distributed under the License is distributed on an
+	"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+	KIND, either express or implied.  See the License for the
+	specific language governing permissions and limitations
+	under the License.
+-->
+
+<book lang="en">
+
+<title>Apache UIMA RegexAnnotator Documentation</title>
+
+<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="../../../SandboxDocs/src/docbook/book_info.xml"/>	
+
+<chapter id="sandbox.regexAnnotator">
+	<title>Regular Expression Annotator</title>
+
+	<para>
+		The Regular Expression Annotator (RegexAnnotator) is an Apache
+		UIMA analysis engine that detects entities based on regular
+		expressions or concepts. A regular expression describes precise
+		patterns that are looked for in the document text. A concept in
+		the current sense is a set of regular expressions that work
+		together to detect a more complex entity. The defined regular
+		expressions or concepts are used to detect entities like
+		numbers, email addresses or URLs and create annotations for
+		them.
+	</para>
+
+	<section id="sandbox.regexAnnotator.processingOverview">
+		<title>Processing Overview</title>
+		<para>
+			To detect entities the RegexAnnotator must be configured
+			using an external XML file. We call this file concepts file
+			since it contains the concepts and regular expression rules
+			that the annotator uses to detect the entities. In addition
+			to the rules and concepts, this configuration also contains
+			the annotations that should be created if an entity was
+			found in the document text. The types and features used to
+			create the annotations must be defined in the UIMA type
+			system.
+		</para>
+		<para>
+			After the configuration is done, the RegexAnnotator is ready
+			to use. During its initialization it reads the concepts file
+			and checks if all rules and concepts are valid and if all
+			annotation types are defined. If no error exists the
+			processing can start. During the processing the rules are
+			processed in the same order as defined in the concepts xml
+			document. The results of a preceding rule can be used for
+			the following one.
+		</para>
+	</section>
+
+	<section id="sandbox.regexAnnotator.conceptsFile">
+		<title>Concepts Configuration File</title>
+		<para>
+			The RegexAnnotator can be configured using two levels of
+			complexity.
+		</para>
+		<para>
+			The RuleSet definition is the simple way to define rules
+			that consist of a regular expression pattern and of
+			annotations that should be created if the rules match an
+			entity.
+		</para>
+		<para>
+			The Concept definition is the more complex way to define
+			rules that consist of more than one regular expression rule
+			that are combined together.
+		</para>
+		<para>
+			The syntax in both definitions is the same, so you don't
+			need to learn two configuration possibilities; the RuleSet
+			definition is just an easier way to configure the annotator for simpler
+			entities. Furthermore it is possible to extend the RuleSet
+			definition with more and more features so that it becomes a
+			real Concept definition.
+		</para>
+
+		<section id="sandbox.regexAnnotator.conceptsFile.rules">
+			<title>RuleSet definition</title>
+			<para>The RuleSet definition looks like:</para>
+			<para>
+
+				<programlisting><![CDATA[
+<conceptSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="concept.xsd">
+
+  <concept name="RuleSetDefinitionExample">
+    <rules>
+      <rule regEx="PatternExample" matchStrategy="matchAll" matchType="uima.tcas.DocumentAnnotation"/>
+    </rules>
+    <createAnnotations>
+      <annotation id="MyAnnotation" type="org.apache.uima.MyAnnotation">
+        <begin group="0"/>
+        <end group="0"/>
+      </annotation>
+    </createAnnotations>
+  </concept>
+
+</conceptSet>
+]]></programlisting>
+			</para>
+			<para>
+				The RuleSet definition above defines a simple concept
+				with the name "RuleSetDefinitionExample". The rule uses
+				the "PatternExample" pattern that is matched on the
+				covered text of the uima.tcas.DocumentAnnotation. As
+				match strategy, "matchAll" is used, which means that all
+				matches for the pattern are used to create the
+				annotations defined in the
+				<code>&lt;createAnnotations></code>
+				element. So for each match a
+				org.apache.uima.MyAnnotation annotation is created that
+				covers the match in the document text.
+			</para>
+			<para>
+				For more advanced configuration possibilities, please
+				refer to the advanced configuration below.
+			</para>
+		</section>
+
+		<section id="sandbox.regexAnnotator.conceptsFile.concepts">
+			<title>Concept definition</title>
+			<para>The concept definition looks like:</para>
+			<para>
+			
+			<programlisting><![CDATA[
+<conceptSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="concept.xsd">
+
+  <concept name="complexConceptExample">
+    <rules>
+      <rule ruleId="Id1" regEx="PatternExample1" matchStrategy="matchAll" matchType="uima.tcas.DocumentAnnotation" confidence="1.0"/>
+      <rule ruleId="Id2" regEx="PatternExample2" matchStrategy="matchAll" matchType="uima.tcas.DocumentAnnotation" confidence="0.7"/>
+      <rule ruleId="Id3" regEx="PatternExample3" matchStrategy="matchAll" matchType="uima.tcas.DocumentAnnotation" confidence="0.3"/>
+    </rules>
+    <createAnnotations>	
+      <annotation id="MyAnnotation1" type="org.apache.uima.MyAnnotation1">
+        <begin group="0"/>
+        <end group="0"/>
+        <setFeature name="confidenceValue" type="Confidence"/>
+        <setFeature name="ruleId" type="RuleId"/>
+      </annotation>
+    </createAnnotations>
+  </concept>
+
+</conceptSet>
+]]></programlisting>
+				
+			</para>
+			<para>
+				As you can see the concept definition is a more complex
+				RuleSet definition. The main differences are the ruleID
+				and confidence features for a rule. If these features
+				are specified, the feature values can be used as
+				annotation feature values when the
+				org.apache.uima.MyAnnotation1 is created. But let's see
+				how this concept is processed.
+			</para>
+			<para>
+				The concept processing depends on a parameter setting
+				for the RegexAnnotator. The parameter to control the
+				processing is called
+				<code>ProcessAllConceptRules</code>
+				. By default this parameter is set to
+				<code>false</code>
+				which means that the concept processing starts with the
+				first rule. If this rule finds any match that triggers
+				the creation of an annotation, the concept processing stops and
+				the other rules are not used. If the first rule doesn't
+				find a match, the next rule is used. This strategy is
+				used until an annotation is found or all rules are
+				processed. If the parameter
+				<code>ProcessAllConceptRules</code>
+				is set to
+				<code>true</code>
+				all rules are processed independent of the matches of a
+				rule.
+			</para>
+			<para>
+				If for a rule an annotation is created that has a
+				<code>&lt;setFeature></code>
+				definition of type
+				<code>Confidence</code>
+				or
+				<code>RuleId</code>
+				the current ruleId and confidence value of the rule is
+				added as feature value to the created annotations. Doing
+				this helps you after the text is processed to make
+				reliable statements about the confidence of your
+				annotation.
+			</para>
+			<note>
+				<para>
+					The features for
+					<code>Confidence</code>
+					and
+					<code>RuleId</code>
+					must be defined by yourself in the UIMA type system.
+					So you can also assign the confidence or ruleId to
+					any other feature you have defined in the UIMA type
+					system. Confidence features have to be of type
+					uima.cas.Float and RuleId features have to be of
+					type uima.cas.String.
+				</para>
+			</note>
+
+		</section>
+
+		<section
+			id="sandbox.regexAnnotator.conceptsFile.rulesDefinition">
+			<title>Rule Definition</title>
+			<para>
+				This section shows in detail how a rule is defined
+				and what the advanced configuration possibilities
+				for the rule processing are.
+			</para>
+			<para>
+				The listing below shows a complex rule definition with
+				all the possible features and details. Please refer to
+				the sub sections for some details.
+			</para>
+			<para>
+				 
+					<programlisting><![CDATA[
+<rule ruleId="ID1" regEx="TestRegex" matchStrategy="matchAll" matchType="uima.tcas.DocumentAnnotation" confidence="1.0">
+
+  <matchTypeFilter>
+    <feature name="language">en</feature>
+  </matchTypeFilter>
+
+  <updateMatchTypeAnnotation>
+    <setFeature name="language" type="String">$0</setFeature>
+  </updateMatchTypeAnnotation>	
+
+  <ruleExceptions>	
+    <exception matchType="uima.tcas.DocumentAnnotation">Exception</exception>
+  </ruleExceptions>
+
+</rule>
+]]></programlisting>
+				
+			</para>
+			<section
+				id="sandbox.regexAnnotator.conceptsFile.rulesDefinition.rule">
+				<title>Rule Definition Details</title>
+				<para>
+					The
+					<code>&lt;rule></code>
+					definition has three mandatory features, these are:
+				</para>
+				<para>
+					<itemizedlist>
+						<listitem>
+							<para>
+								<code>regEx</code>
+								- The regular expression pattern that
+								should be used for this rule using the
+								Java regular expression syntax.
+							</para>
+						</listitem>
+						<listitem>
+							<para>
+								<code>matchStrategy</code>
+								- The match strategy that should be used
+								for this rule. Possible values are
+								<code>matchAll</code>
+								to get all matches,
+								<code>matchFirst</code>
+								to get the first match and
+								<code>matchComplete</code>
+								to get only matches if the whole input
+								text matches the regEx pattern.
+							</para>
+						</listitem>
+						<listitem>
+							<para>
+								<code>matchType</code>
+								- As match type the annotation type has
+								to be specified whose covered text
+								should be used as input text for the
+								regEx pattern.
+							</para>
+						</listitem>
+					</itemizedlist>
+				</para>
+				<para>
+					Additionally the
+					<code>&lt;rule></code>
+					definition also has some optional features that can
+					be set, these are:
+				</para>
+				<itemizedlist>
+					<listitem>
+						<para>
+							<code>ruleId</code>
+							- Specifies a unique ID for the rule. This
+							ID value can later be used to add it as
+							value to an annotation feature (see
+							<code>&lt;setFeature></code>
+							).
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>confidence</code>
+							- Specifies the confidence value of this
+							rule. Maybe you have more than one rule and
+							use different patterns to describe the same
+							entity, so you can classify the rules with
+							a confidence value. This confidence value
+							can later be used to add it as value to an
+							annotation feature (see
+							<code>&lt;setFeature></code>
+							).
+						</para>
+					</listitem>
+				</itemizedlist>
+			</section>
+			<section
+				id="sandbox.regexAnnotator.conceptsFile.rulesDefinition.filter">
+				<title>Match Type Filter</title>
+				<para>
+					 
+						<programlisting><![CDATA[
+<matchTypeFilter>
+  <feature name="language">en</feature>
+</matchTypeFilter>
+]]></programlisting>
+					
+
+				</para>
+				<para>
+					The match type filter construct can be used to
+					filter the match type annotations before they are
+					used for the evaluation. The
+					<code>&lt;matchTypeFilter></code>
+					element can contain one or more
+					<code>&lt;feature></code>
+					elements that contain filter information.
+				</para>
+				<para>
+					The name of the UIMA feature is specified using the
+					<code>name</code>
+					feature of the
+					<code>&lt;feature></code>
+					element. The content of the
+					<code>&lt;feature></code>
+					element contains the regular expression pattern that
+					has to match the UIMA feature value. In the example
+					above the match type annotation has a feature
+					"language" that must have the content "en". If that
+					is true, the annotation passes the filter
+					condition.
+				</para>
+			</section>
+			<section
+				id="sandbox.regexAnnotator.conceptsFile.rulesDefinition.update">
+				<title>Update Match Type Annotation</title>
+				<para>
+					
+					<programlisting><![CDATA[
+<updateMatchTypeAnnotation>
+  <setFeature name="language" type="String">$0</setFeature>
+</updateMatchTypeAnnotation>
+]]></programlisting>
+					
+
+				</para>
+				<para>
+					With the
+					<code>&lt;updateMatchTypeAnnotation></code>
+					construct you can configure to update a UIMA feature
+					value at the match type annotation if a rule match
+					was found. The
+					<code>&lt;updateMatchTypeAnnotation></code>
+					can have one or more
+					<code>&lt;setFeature></code>
+					elements.
+				</para>
+				<para>
+					The
+					<code>&lt;setFeature></code>
+					element has the two mandatory features, these are:
+				</para>
+				<itemizedlist>
+					<listitem>
+						<para>
+							<code>name</code>
+							- Specifies the UIMA feature name that
+							should be set at the match type annotation.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>type</code>
+							- Specifies the UIMA feature type that is
+							defined in the UIMA type system. Possible
+							values are
+							<code>String</code>
+							,
+							<code>Integer</code>
+							and
+							<code>Float</code>
+						</para>
+					</listitem>
+				</itemizedlist>
+				<para>
+					The content of the
+					<code>&lt;setFeature></code>
+					element contains the value that should be set. This
+					can either be a literal value or it can be a regular
+					expression matching group as shown in the example
+					above. A combination of matching groups and literals
+					is also possible.
+				</para>
+			</section>
+			<section
+				id="sandbox.regexAnnotator.conceptsFile.rulesDefinition.exception">
+				<title>Rule exception</title>
+				<para>
+					 
+					<programlisting><![CDATA[
+<ruleExceptions>	
+  <exception matchType="uima.tcas.DocumentAnnotation">ExceptionPattern</exception>
+</ruleExceptions>
+]]></programlisting>
+					
+
+				</para>
+				<para>
+					With the
+					<code>&lt;ruleExceptions></code>
+					construct you can configure exceptions to prevent matches for the current rule. 
+					An exception is something
+					similar to a filter, but on the higher level. For
+					example take the scenario where you have several token annotations that
+					are all covered by a sentence annotation. You have written a rule that can detect
+					car brands. The text you analyze has the sentence "Henry Ford was born 1863". 
+					When analyzing the text you will get a car brand annotation since "Ford" is
+					a car brand. But is this behavior correct? To work around that issue,
+					you can create an exception that looks like
+					 <programlisting><![CDATA[
+<ruleExceptions>	
+  <exception matchType="uima.SentenceAnnotation">Henry</exception>
+</ruleExceptions>
+]]></programlisting>
+					and add it to your car brand rule. After adding this, car brand annotations
+					are only created if the sentence annotation that covers the token annotation
+					does not contain the word "Henry". 					
+				</para>
+				<para>
+					The
+					<code>&lt;ruleExceptions></code>
+					element can have one or more exceptions specified with the 
+					<code>&lt;exception></code>
+					elements.
+				</para>
+				<para>
+					The
+					<code>&lt;exception></code>
+					element has one mandatory feature called
+					<code>matchType</code>. The <code>matchType</code> feature
+					specifies the annotation type the exception is based on. 
+					The exception annotation instance that is used during the runtime is evaluated for each
+					match type annotation that is used to match a rule. As
+					exception annotation instance always the covering annotation
+					of the match type annotation is searched. 
+					If no covering annotation was found the exception is not evaluated.
+				</para>
+				<para>
+					The content of the
+					<code>&lt;exception></code>
+					element specifies the regular expression that is used to evaluate the exception.
+				</para>
+				<para>
+					If the exception match is true, the
+					current match type annotation is filtered out and is
+					not used to create any matches and annotations.
+				</para>
+			</section>
+		</section>
+		<section id="sandbox.regexAnnotator.conceptsFile.annotationDefinition">
+				<title>Annotation Definition</title>
+				<para>
+				  This section explains in detail how to create annotations if a rule has matched.
+				  The listing below shows the definition of an annotation with all possible settings.
+				</para>
+				<para>
+				<programlisting><![CDATA[
+<annotation id="testannot" type="org.apache.uima.TestAnnot">
+	<begin group="0" location="start"/>
+	<end group="0" location="end"/>
+	<setFeature name="testFeature1" type="String">$0</setFeature>
+	<setFeature name="testFeature2" type="Integer">$1</setFeature>
+	<setFeature name="testFeature3" type="Float">$2</setFeature>		
+	<setFeature name="testFeature4" type="Reference">testannot1</setFeature>
+	<setFeature name="confidenceValue" type="Confidence"/>
+	<setFeature name="ruleId" type="RuleId"/>
+</annotation>
+]]></programlisting>
+				</para>
+				<section id="sandbox.regexAnnotator.conceptsFile.annotationDefinition.details">
+				<title>Annotation Definition Details</title>
+				<para>
+				  The <code>&lt;annotation></code> definition has two mandatory features, these are:
+				</para>
+				<para>
+				<itemizedlist>
+					<listitem>
+						<para>
+							<code>id</code>
+							- Specifies the annotation id for this annotation. The id must be unique within the
+							concepts file.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>type</code>
+							- Specifies the UIMA annotation type that should be used if a match was found
+							to create the annotation. The type used has to be defined in the UIMA type system.
+						</para>
+					</listitem>
+				</itemizedlist>
+				</para>
+				<para>
+				  The mandatory sub elements of <code>&lt;annotation></code> are:
+				</para>
+				<para>
+				<itemizedlist>
+					<listitem>
+						<para>
+							<code>&lt;begin></code>
+							- Specifies the begin position of the annotation.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>&lt;end></code>
+							- Specifies the end position of the annotation.
+						</para>
+					</listitem>
+				</itemizedlist>
+				</para>
+				<para>
+				  The optional sub elements of <code>&lt;annotation></code> are:
+				</para>
+				<para>
+				<itemizedlist>
+					<listitem>
+						<para>
+							<code>&lt;setFeature></code>
+							- set a UIMA feature at the created annotation.
+						</para>
+					</listitem>
+				</itemizedlist>
+				</para>
+				</section>
+				<section id="sandbox.regexAnnotator.conceptsFile.annotationDefinition.boundaries">
+				<title>Annotation Boundaries</title>
+				<para>
+				  The <code>&lt;annotation></code> element defines the annotations boundaries using the
+				  sub elements <code>&lt;begin></code> and <code>&lt;end></code>. The start position of
+				  an annotation is defined using the <code>&lt;begin></code> element. The end position using
+				  the <code>&lt;end></code> element. Both elements have the same features as shown below:
+				</para>
+				<para>
+				<itemizedlist>
+					<listitem>
+						<para>
+							<code>group</code>
+							- identifies a capturing group within the regular expression pattern of the 
+							current rule. It can be assigned a single number from 0 to 9, where 0 denotes 
+							the whole match, 1 the first match group, 2 the second, and so on.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>location</code>
+							- indicates a position inside the match group, which can either be the position 
+							of the left parenthesis in case of a value “start”, or the right parenthesis in 
+							case of a value “end”. The <code>location</code> feature is optional. By default
+							the <code>&lt;begin></code> element sets <code>location="start"</code> and the 
+							<code>&lt;end></code> element <code>location="end"</code>.
+						</para>
+					</listitem>
+				</itemizedlist>
+				</para>
+				</section>
+				<section id="sandbox.regexAnnotator.conceptsFile.annotationDefinition.features">
+				<title>Annotation Features</title>
+				<para>
+				  With the <code>&lt;setFeature></code> element of <code>&lt;annotation></code> it is 
+				  possible to set UIMA features at the created annotation. The mandatory features
+				  that must be set are: 
+				</para>
+				<para>
+				<itemizedlist>
+					<listitem>
+						<para>
+							<code>name</code>
+							- specifies the UIMA feature name that should be set.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>type</code>
+							- specifies the type of the UIMA feature. For a list of all
+							possible type values please refer to the feature types section below.
+						</para>
+					</listitem>
+				</itemizedlist>
+				</para>
+				<para>
+				  The content of the <code>&lt;setFeature></code> element specifies the value of the
+				  UIMA feature that is set. As value a literal, a capturing group or a combination of
+				  both can be specified.
+				</para>
+				<section id="sandbox.regexAnnotator.conceptsFile.annotationDefinition.featureTypes">
+				<title>Feature types</title>
+				<para>
+				  The <code>&lt;setFeature></code> element has a feature called <code>type</code> 
+				  to specify the UIMA feature type. The possible feature types are listed below: 
+				</para>
+				<para>
+				<itemizedlist>
+					<listitem>
+						<para>
+							<code>String</code>
+							- for <code>uima.cas.String</code> based UIMA features.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>Integer</code>
+							- for <code>uima.cas.Integer</code> based UIMA features.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>Float</code>
+							- for <code>uima.cas.Float</code> based UIMA features.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>Reference</code>
+							- to link a UIMA feature to another annotation. In this case the
+							UIMA feature type have to be the same as the referred annotation type.
+							To reference another annotation the <code>&lt;setFeature></code>
+							content have to contain the annotation id of the referred annotation.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>Confidence</code>
+							- add if available the value of the <code>confidence</code> feature defined
+							at the <code>&lt;rule></code> element to this feature. The UIMA feature have to
+							be of type <code>uima.cas.Float</code>.
+						</para>
+					</listitem>
+					<listitem>
+						<para>
+							<code>RuleId</code>
+							- add if available the value of the <code>ruleId</code> feature defined
+							at the <code>&lt;rule></code> element to this feature. The UIMA feature have to
+							be of type <code>uima.cas.String</code>.
+						</para>
+					</listitem>					
+				</itemizedlist>
+				</para>
+
+				</section>
+			</section>			
+		</section>
+		<section id="sandbox.regexAnnotator.annotatorDescriptor">
+			<title>Annotator Descriptor</title>
+			<para>The RegexAnnotator analysis engine descriptor contains some processing information about 
+			the annotator. This processing information is specified as parameters and external resource dependencies. 
+			In this chapter we will look in detail at the descriptor settings.
+			</para>
+			<section id="sandbox.regexAnnotator.annotatorDescriptor.configParam">
+				<title>Configuration Parameters</title>
+				<para>
+				  The RegexAnnotator has the following configuration parameters that can affect the processing: 
+				</para>
+				<para>
+					<itemizedlist>
+						<listitem>
+							<para>
+								<code>ProcessAllConceptRules</code>
+								- If this parameter is set to true, all rules of a concept are processed. 
+								If this parameter is set to false, the rules are processed by confidence 
+								(highest confidence value first) and the processing stops after the first 
+								rule where matches are available.
+							</para>
+						</listitem>
+				  	</itemizedlist>
+				</para>
+			</section>
+			<section id="sandbox.regexAnnotator.annotatorDescriptor.externalResource">
+				<title>External Resources</title>
+				<para>
+				  To specify the concept file that contains all the concepts and rules the 
+				  RegexAnnotator should process, an external resource binding is used. 
+				  The important section in the descriptor where the external resource
+				  is specified is shown below.
+				</para>
+				<para>
+				<programlisting><![CDATA[
+<externalResources>
+  <externalResource>
+    <name>RegexConceptsFile</name>
+    <description>Regex Concepts file</description>
+    <fileResourceSpecifier>
+      <fileUrl>file:concepts.xml</fileUrl>
+    </fileResourceSpecifier>
+    <implementationName>org.apache.uima.annotator.regex.impl.FileResource_impl</implementationName>
+  </externalResource>
+</externalResources>
+]]></programlisting>
+				</para>
+				<para>
+				  The <code>&lt;fileUrl></code> element contains the file URL of the concept file.
+				  The given URL has to be available in the UIMA datapath or in the classpath. 
+				</para>
+				
+			</section>
+			<section id="sandbox.regexAnnotator.annotatorDescriptor.capabilities">
+				<title>Capabilities</title>
+				<para>
+				  In the capabilities section of the RegexAnnotator descriptor the input and output 
+				  capabilities and the supported languages have to be defined. 
+				</para>
+				<para>
+				  The input capabilities defined
+				  in the descriptor have to comply with the match types used in the concept rule file 
+				  that is used. For example the <code>uima.SentenceAnnotation</code> used in the rule
+				  below must be added to the input capability section in the RegexAnnotator descriptor.
+				</para>
+				<para>
+				<programlisting><![CDATA[
+<rules>
+  <rule regEx="RestRegex" matchStrategy="matchAll" matchType="uima.SentenceAnnotation"/>
+</rules>
+]]></programlisting>
+				</para>
+				<para>
+				  In the output section, all of the annotation types and features created by 
+				  the RegexAnnotator have to be specified. These have to match the 
+				  output types and features declared in the <code>&lt;annotation></code> elements of the concept file.
+				  For example the <code>org.apache.uima.TestAnnot</code> annotation and the 
+				  <code>org.apache.uima.TestAnnot:testFeature</code> feature used below must
+				  be added to the output capability section in the RegexAnnotator descriptor. 
+				</para>
+				<para>
+				<programlisting><![CDATA[
+<createAnnotations>
+  <annotation id="testannotation" type="org.apache.uima.TestAnnot">
+    <begin group="0"/>
+    <end group="0"/>
+    <setFeature name="testFeature" type="String">$0</setFeature>
+  </annotation>
+</createAnnotations>
+]]></programlisting>
+				</para>
+				<para>
+				  If there are any language dependent rules in the concept file the supported languages abbreviations 
+				  have to be specified in the <code>&lt;languagesSupported></code> element. If there are no 
+				  language dependent rules available you can specify <code>x-unspecified</code> as language. That means
+				  that the annotator can work on all languages.   
+				</para>
+				<para>
+				  For the short examples used above, the capabilities section in the RegexAnnotator 
+				  descriptor looks like this:
+				</para>
+				<para>
+				<programlisting><![CDATA[
+<capabilities>
+  <capability>
+    <inputs>
+      <type>uima.SentenceAnnotation</type>
+    </inputs>
+    <outputs>
+      <type>org.apache.uima.TestAnnot</type>
+      <feature>org.apache.uima.TestAnnot:testFeature</feature>
+    </outputs>
+    <languagesSupported>
+      <language>x-unspecified</language>
+    </languagesSupported>
+  </capability>
+</capabilities>
+]]></programlisting>
+				</para>
+			</section>
+		</section>
+		<section id="sandbox.regexAnnotator.xsd">
+			<title>Concept File Schema</title>
+			<para>The concept file schema looks like this:
+			</para>
+			<para>
+				<programlisting><![CDATA[
+<?xml version="1.0" encoding="UTF-8"?>
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
+	<!--
+		* Licensed to the Apache Software Foundation (ASF) under one
+		* or more contributor license agreements.  See the NOTICE file
+		* distributed with this work for additional information
+		* regarding copyright ownership.  The ASF licenses this file
+		* to you under the Apache License, Version 2.0 (the
+		* "License"); you may not use this file except in compliance
+		* with the License.  You may obtain a copy of the License at
+		* 
+		*   http://www.apache.org/licenses/LICENSE-2.0
+		* 
+		* Unless required by applicable law or agreed to in writing,
+		* software distributed under the License is distributed on an
+		* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+		* KIND, either express or implied.  See the License for the
+		* specific language governing permissions and limitations
+		* under the License.
+	-->
+
+  <xs:element name="conceptSet">
+	<xs:complexType>
+	  <xs:sequence>
+		<xs:element ref="concept" minOccurs="1"	maxOccurs="unbounded" />
+	  </xs:sequence>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="concept">
+	<xs:complexType>
+	  <xs:sequence>
+		<xs:element ref="rules" minOccurs="1" maxOccurs="1" />
+		<xs:element ref="createAnnotations" minOccurs="1" maxOccurs="1" />
+	  </xs:sequence>
+	  <xs:attribute name="name" type="xs:ID" use="optional" />
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="createAnnotations">
+	<xs:complexType>
+	  <xs:sequence>
+		<xs:element ref="annotation" minOccurs="1" maxOccurs="unbounded" />
+	  </xs:sequence>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="rules">
+	<xs:complexType>
+	  <xs:sequence>
+		<xs:element ref="rule" minOccurs="1" maxOccurs="unbounded" />
+	  </xs:sequence>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="rule">
+	<xs:complexType>
+	  <xs:all>
+		<xs:element ref="matchTypeFilter" minOccurs="0"	maxOccurs="1" />
+		<xs:element ref="updateMatchTypeAnnotation" minOccurs="0" maxOccurs="1" />
+		<xs:element ref="ruleExceptions" minOccurs="0" maxOccurs="1" />
+	  </xs:all>
+	  <xs:attribute name="regEx" type="xs:string" use="required" />
+	  <xs:attribute name="matchStrategy" use="required">
+	    <xs:simpleType>
+		  <xs:restriction base="xs:string">
+		    <xs:enumeration value="matchFirst" />
+			<xs:enumeration value="matchAll" />
+			<xs:enumeration value="matchComplete" />
+		  </xs:restriction>
+		</xs:simpleType>
+	  </xs:attribute>
+	  <xs:attribute name="matchType" type="xs:string" use="required" />
+	  <xs:attribute name="ruleId" type="xs:ID" use="optional" />
+	  <xs:attribute name="confidence" type="xs:decimal"	use="optional" />
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="matchTypeFilter">
+	<xs:complexType>
+	  <xs:sequence>
+		<xs:element ref="feature" minOccurs="0"	maxOccurs="unbounded" />
+	  </xs:sequence>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="ruleExceptions">
+	<xs:complexType>
+	  <xs:sequence>
+	    <xs:element ref="exception" minOccurs="0" maxOccurs="unbounded" />
+	  </xs:sequence>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="exception">
+	<xs:complexType>
+	  <xs:simpleContent>
+		<xs:extension base="xs:string">
+		  <xs:attribute name="matchType" type="xs:string" use="required" />
+		</xs:extension>
+	  </xs:simpleContent>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="feature">
+	<xs:complexType>
+	  <xs:simpleContent>
+		<xs:extension base="xs:string">
+		  <xs:attribute name="name" type="xs:string" use="required" />
+		</xs:extension>
+	  </xs:simpleContent>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="annotation">
+	<xs:complexType>
+	  <xs:sequence>
+		<xs:element ref="begin" minOccurs="1" maxOccurs="1" />
+		<xs:element ref="end" minOccurs="1" maxOccurs="1" />
+		<xs:element ref="setFeature" minOccurs="0" maxOccurs="unbounded" />
+	  </xs:sequence>
+	  <xs:attribute name="id" type="xs:ID" use="required" />
+	  <xs:attribute name="type" type="xs:string" use="required" />
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="updateMatchTypeAnnotation">
+	<xs:complexType>
+	  <xs:sequence>
+	    <xs:element ref="setFeature" minOccurs="0" maxOccurs="unbounded" />
+	  </xs:sequence>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="begin">
+	<xs:complexType>
+	  <xs:attribute name="group" use="required">
+	    <xs:simpleType>
+		  <xs:restriction base="xs:integer">
+		    <xs:minInclusive value="0" />
+			<xs:maxInclusive value="9" />
+		  </xs:restriction>
+		</xs:simpleType>
+	  </xs:attribute>
+	  <xs:attribute name="location" use="optional" default="start">
+	    <xs:simpleType>
+	      <xs:restriction base="xs:string">
+		    <xs:enumeration value="start" />
+		    <xs:enumeration value="end" />
+		  </xs:restriction>
+	    </xs:simpleType>
+	  </xs:attribute>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="end">
+	<xs:complexType>
+	  <xs:attribute name="group" use="required">
+		<xs:simpleType>
+		  <xs:restriction base="xs:integer">
+			<xs:minInclusive value="0" />
+			<xs:maxInclusive value="9" />
+		  </xs:restriction>
+		</xs:simpleType>
+	  </xs:attribute>
+	  <xs:attribute name="location" use="optional" default="end">
+		<xs:simpleType>
+		  <xs:restriction base="xs:string">
+		    <xs:enumeration value="start" />
+			<xs:enumeration value="end" />
+		  </xs:restriction>
+		</xs:simpleType>
+	  </xs:attribute>
+	</xs:complexType>
+  </xs:element>
+
+  <xs:element name="setFeature">
+	<xs:complexType>
+	  <xs:simpleContent>
+		<xs:extension base="xs:string">
+		  <xs:attribute name="name" type="xs:string" use="required" />
+		  <xs:attribute name="type" use="required">
+		    <xs:simpleType>
+			  <xs:restriction base="xs:string">
+			    <xs:enumeration value="String" />
+				<xs:enumeration value="Integer" />
+				<xs:enumeration value="Float" />
+				<xs:enumeration value="Reference" />
+				<xs:enumeration value="Confidence" />
+				<xs:enumeration value="RuleId" />
+			  </xs:restriction>
+			</xs:simpleType>
+		  </xs:attribute>
+		</xs:extension>
+	  </xs:simpleContent>
+	</xs:complexType>
+  </xs:element>
+</xs:schema>
+]]></programlisting>
+			  
+			</para>
+		</section>
+	</section>
+
+</chapter>
+
+</book>
\ No newline at end of file