You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2012/07/20 14:27:15 UTC
svn commit: r1363750 [2/3] - in /uima/sandbox/trunk/TextMarker/uima-docbook-textmarker: ./ src/docbook/

Added: uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.language.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.language.xml?rev=1363750&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.language.xml (added)
+++ uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.language.xml Fri Jul 20 12:27:14 2012
@@ -0,0 +1,626 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tools.textmarker/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >  
+%uimaents;
+]>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	you under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+
+<chapter id="ugr.tools.tm.language.language">
+	<title>TextMarker Language</title>
+	<para>
+
+	</para>
+
+	<section id="ugr.tools.tm.language.seeding">
+		<title>Basic Annotations and tokens</title>
+		<para>
+			The TextMarker system uses a JFlex lexer to initially create a
+			seed of
+			basic, token annotations.
+		</para>
+	</section>
+	<section id="ugr.tools.tm.language.syntax">
+		<title>Syntax</title>
+		<para>
+			Structure
+			<programlisting><![CDATA[
+            script                 -> packageDeclaration globalStatements statements
+            packageDeclaration     -> "PACKAGE" DottedIdentifier ";"
+            globalStatements       -> globalStatement*   
+            globalStatement        -> ("TYPESYSTEM" | "SCRIPT" | "ENGINE") DottedIdentifier ";"
+            statements             -> statement*
+            statement              -> typeDeclaration | resourceDeclaration | variableDeclaration 
+                                      | blockDeclaration | simpleStatement
+            ]]></programlisting>
+
+			Declarations
+			<programlisting><![CDATA[
+				typeDeclaration -> "DECLARE" (AnnotationType)? Identifier ("," Identifier )*
+				| "DECLARE" AnnotationType Identifier ( "(" featureDeclaration ")" )?
+				featureDeclaration -> ( (AnnotationType | "STRING" | "INT" |
+				"DOUBLE" | "BOOLEAN") Identifier)+
+				resourceDeclaration -> ("WORDLIST" Identifier = listExpression | "WORDTABLE" Identifier
+				= tableExpression) ";"
+				variableDeclaration -> ("TYPE" | "STRING" | "INT" | "DOUBLE" | "BOOLEAN") Identifier
+				";"
+				]]>
+			</programlisting>
+			More information about Declarations.
+
+			Statements
+			<programlisting><![CDATA[
+            blockDeclaration       -> "BLOCK" "(" Identifier ")" ruleElementWithType "{" statements "}"
+            simpleStatement        -> ruleElements ";"
+            ruleElements           -> ( ruleElementWithLiteral  | ruleElementWithType )+
+            ruleElementWithLiteral -> simpleStringExpression quantifierPart? conditionActionPart?
+            ruleElementWithType    -> typeExpression quantifierPart? conditionActionPart?
+            quantifierPart         -> "*" | "*?" | "+" | "+?" | "?" | "??" 
+                                      | "[" numberExpression "," numberExpression "]"
+                                      | "[" numberExpression "," numberExpression "]?"
+                                      
+            conditionActionPart    -> "{" (condition ( "," condition )*)? ( "->" (action( "," action)*))? "}"        
+            condition              -> ConditionName ("(" argument ("," argument)* ")")?
+            action                 -> ActionName ("(" argument ("," argument)* ")")?
+            ]]></programlisting>
+			More information about Quantifiers,
+			Conditions, Actions and Blocks.
+			The ruleElementWithType of a BLOCK declaration must have opening
+			and
+			closing curly brackets (e.g., BLOCK(name) Document{} {...})
+
+			Expressions
+			<programlisting><![CDATA[
+            argument                   -> typeExpression | numberExpression | stringExpression | booleanExpression
+            typeExpression             -> AnnotationType | TypeVariable
+            numberExpression           -> additiveExpression
+            additiveExpression         -> multiplicativeExpression ( ( "+" | "-" ) multiplicativeExpression )*
+            multiplicativeExpression   -> simpleNumberExpression ( ( "*" | "/" | "%" ) simpleNumberExpression )*
+                                          | ( "EXP" | "LOGN" | "SIN" | "COS" | "TAN" ) numberExpressionInPar
+            numberExpressionInPar      -> "(" additiveExpression ")"
+            simpleNumberExpression     -> "-"? ( DecimalLiteral | FloatingPointLiteral | NumberVariable)
+                                          | numberExpressionInPar      
+            stringExpression           -> simpleStringExpression ( "+" simpleSEOrNE )*                   
+            simpleStringExpression     -> StringLiteral | StringVariable
+            simpleSEOrNE               -> simpleStringExpression | numberExpressionInPar
+            booleanExpression          -> booleanNumberExpression | BooleanVariable | BooleanLiteral
+            booleanNumberExpression    -> "(" numberExpression ( "<" | "<=" | ">" | ">=" | "==" | "!=" ) numberExpression ")"
+            listExpression             -> Identifier | ResourceLiteral
+            tableExpression            -> Identifier | ResourceLiteral
+            ]]></programlisting>
+			More information about Expressions. A ResourceLiteral
+			is something
+			like 'folder/file.txt' (yes, with single quotes).
+		</para>
+	</section>
+	<section id="ugr.tools.tm.language.inference">
+		<title>Inference</title>
+		<para>
+			The inference relies on a complete, disjunctive partition of the
+			document. A basic (minimal) annotation for each element of the
+			partition is assigned to a type of a hierarchy. These basic
+			annotations are enriched for performance reasons with information
+			about annotations that start at the same offset or overlap with the
+			basic annotation. Normally, a scanner creates a basic annotation for
+			each token, punctuation or whitespace, but can also be replaced with
+			a different annotation seeding strategy. Unlike other rule-based
+			information extraction languages, the rules are executed in an
+			imperative way. Experience has shown that the dependencies between
+			rules, e.g., the same annotation types in the action and in the
+			condition of a different rule, often form tree-like and not
+			graph-like structures. Therefore, the sequencing and imperative
+			processing did not cause disadvantages, but instead obvious
+			advantages, e.g., the improved understandability of large rule sets.
+			The following algorithm summarizes the rule inference:
+			<programlisting><![CDATA[
+collect all basic annotations that fulfill the first matching condition
+  for all collected basic annotations do
+    for all rule elements of current rule do
+    if quantifier wants to match then
+      match the conditions of the rule element on the current basic annotation
+      determine the next basic annotation after the current match
+      if quantifier wants to continue then
+        if there is a next basic annotation then
+          continue with the current rule element and the next basic annotation
+        else if rule element did not match then
+          reset the next basic annotation to the current one
+      set the current basic annotation to the next one
+      if some rule elements did not match then
+        stop and continue with the next collected basic annotation
+      else if there is no current basic annotation and the quantifier wants to continue then
+        set the current basic annotation to the previous one
+  if all rule elements matched then
+    execute the actions of all rule elements
+]]></programlisting>
+			The rule elements can of course match on all kinds of annotations.
+			Therefore the determination of the next basic annotation returns the
+			first basic annotation after the last basic annotation of the
+			complete, matched annotation.
+
+		</para>
+	</section>
+	<section id="ugr.tools.tm.language.declarations">
+		<title>Declarations</title>
+		<para>
+
+			There are three different kinds of declarations in the TextMarker
+			system:
+			declarations of types with optional feature definitions of
+			that type,
+			declarations of variables and declarations for importing
+			external
+			resources, scripts or UIMA components.
+		</para>
+		<section id="ugr.tools.tm.language.declarations.type">
+			<title>Type</title>
+			<para>
+				Type declarations define new kinds of annotation types and
+				optionally their features.
+
+				Examples:
+				<programlisting><![CDATA[
+            DECLARE SimpleType1, SimpleType2; // <- two new types with the parent type "Annotation"
+            DECLARE ParentType NewType (SomeType feature1, INT feature2); // <- defines a new type "NewType" 
+                // with parent type "ParentType" and two features
+            ]]></programlisting>
+
+				If the parent type is not defined in the same namespace, then the
+				complete namespace has to be used, e.g., DECLARE
+				my.other.package.Parent NewType;
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.declarations.variable">
+			<title>Variable</title>
+			<para>
+				Variable declarations define new variables. There are five kinds of
+				variables:
+				* Type variable: A variable that represents an annotation
+				type.
+				* Integer variable: A variable that represents an integer.
+				*
+				Double variable: A variable that represents a floating-point
+				number.
+				* String variable: A variable that represents a string.
+				*
+				Boolean
+				variable: A variable that represents a boolean.
+
+				Examples:
+				<programlisting><![CDATA[
+                TYPE newTypeVariable;
+                INT newIntegerVariable;
+                DOUBLE newDoubleVariable;
+                STRING newStringVariable;
+                BOOLEAN newBooleanVariable;
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.declarations.ressource">
+			<title>Resources</title>
+			<para>
+
+				There are two kinds of resource declarations that make external
+				resources available in the TextMarker system:
+				* List: A list
+				represents a normal text file with an entry per line
+				or a compiled
+				tree of a word list.
+				* Table: A table represents a comma-separated
+				file.
+
+				Examples:
+				<programlisting><![CDATA[
+                WORDLIST Name = 'someWordList.txt';
+                WORDTABLE Name = 'someTable.csv';
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.declarations.scripts">
+			<title>Scripts</title>
+			<para>
+
+				Additional scripts can be imported and reused with the CALL action.
+				The types of the imported rules are then also available, so that it
+				is not necessary to import the Type System of the additional rule
+				script.
+
+				Examples:
+				<programlisting><![CDATA[
+                SCRIPT my.package.AnotherScript; // <- "AnotherScript.tm" in the "my.package" package
+                Document{->CALL(AnotherScript)}; // <- rule executes "AnotherScript.tm"
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.declarations.components">
+			<title>Components</title>
+			<para>
+
+				There are two kinds of UIMA components that can be imported in a
+				TextMarker script:
+				* Type System: includes the types defined in an
+				external type system.
+				* Analysis Engine: makes an external analysis
+				engine available. The
+				type system needed for the analysis engine has
+				to be imported
+				separately. Please mind the filtering setting when
+				calling an
+				external analysis engine.
+
+				Examples:
+				<programlisting><![CDATA[
+                ENGINE my.package.ExternalEngine; // <- "ExternalEngine.xml" in the 
+                    // "my.package" package (in the descriptor folder)
+                TYPESYSTEM my.package.ExternalTypeSystem; // <- "ExternalTypeSystem.xml" 
+                    // in the "my.package" package (in the descriptor folder)
+                Document{->RETAINTYPE(SPACE,BREAK),CALL(ExternalEngine)}; 
+                    // calls ExternalEngine, but retains white spaces
+                ]]></programlisting>
+
+			</para>
+		</section>
+	</section>
+	<section id="ugr.tools.tm.language.quantifier">
+		<title>Quantifiers</title>
+		<para>
+		</para>
+		<section id="ugr.tools.tm.language.quantifier.sg">
+			<title>* Star Greedy</title>
+			<para>
+				The Star Greedy quantifier matches on any number of annotations and
+				always evaluates to true. Please mind that a rule element with a Star
+				Greedy quantifier needs to match on different annotations than the
+				next rule element.
+
+				Examples:
+				<programlisting><![CDATA[
+                Input:    small Big Big Big small
+                Rule:     CW*
+                Matched:  Big Big Big  
+                Matched:  Big Big 
+                Matched:  Big
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.quantifier.sr">
+			<title>*? Star Reluctant</title>
+			<para>
+				The Star Reluctant quantifier matches on any amount of annotations
+				and always evaluates to true, but stops matching on new annotations
+				when the next rule element matches and evaluates true on this
+				annotation.
+
+				Examples:
+				<programlisting><![CDATA[
+                Input:    123 456 small small Big 
+                Rule:     W*? CW
+                Matched:  small small Big
+                Matched:  small Big
+                Matched:  Big
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.quantifier.pg">
+			<title>+ Plus Greedy</title>
+			<para>
+				The Plus Greedy quantifier needs to match on at least one
+				annotation. Please mind, that a rule element after a rule element
+				with a Plus Greedy quantifier matches and evaluates on different
+				conditions.
+
+				Examples:
+
+				<programlisting><![CDATA[
+                Input:    123 456 small small Big 
+                Rule:     SW+ 
+                Matched:  small small
+                Matched:  small 
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.quantifier.pr">
+			<title>+? Plus Reluctant</title>
+			<para>
+				The Plus Reluctant quantifier has to match on at least one
+				annotation in order to evaluate true, but stops when the next rule
+				element is able to match on this annotation.
+
+				Examples:
+				<programlisting><![CDATA[
+                Input:    123 456 small small Big 
+                Rule:     W+? CW
+                Matched:  small small Big
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.quantifier.qg">
+			<title>? Question Greedy</title>
+			<para>
+				The Question Greedy quantifier matches optionally on an annotation
+				and therefore always evaluates true.
+
+				Examples:
+				<programlisting><![CDATA[
+                Input:    123 456 small Big small Big 
+                Rule:     SW CW? SW
+                Matched:  small Big small
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.quantifier.qr">
+			<title>?? Question Reluctant</title>
+			<para>
+				The Question Reluctant quantifier matches optionally on an
+				annotation if the next rule element can not match on the same
+				annotation and therefore always evaluates true.
+
+				Examples:
+				<programlisting><![CDATA[
+                Input:    123 456 small Big small Big 
+                Rule:     SW CW?? SW
+                Matched:  small Big small
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.quantifier.mmg">
+			<title>[x,y] Min Max Greedy</title>
+			<para>
+				The Min Max Greedy quantifier has to match at least x and at most y
+				annotations of its rule element to evaluate true.
+
+				Examples:
+				<programlisting><![CDATA[
+                Input:    123 456 small Big small Big 
+                Rule:     SW CW[1,2] SW
+                Matched:  small Big small
+                ]]></programlisting>
+
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.quantifier.mmr">
+			<title>[x,y]? Min Max Reluctant</title>
+			<para>
+				The Min Max Reluctant quantifier has to match at least x and at most y
+				annotations of its rule element to evaluate true, but stops to
+				match
+				on additional annotations if the next rule element is able to
+				match
+				on this annotation.
+
+				Examples:
+				<programlisting><![CDATA[
+                Input:    123 456 small Big Big Big small Big 
+                Rule:     SW CW[2,100]? SW
+                Matched:  small Big Big Big small
+                ]]></programlisting>
+			</para>
+		</section>
+	</section>
+
+	
+	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="tools.textmarker.conditions.xml"/>
+	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="tools.textmarker.actions.xml"/>
+	
+	<section id="ugr.tools.tm.language.expressions">
+		<title>Expressions</title>
+		<para>
+		</para>
+		<section id="ugr.tools.tm.language.expressions.type">
+			<title>Type Expressions</title>
+			<para>
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.expressions.numer">
+			<title>Number Expressions</title>
+			<para>
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.expressions.string">
+			<title>String Expressions</title>
+			<para>
+			</para>
+		</section>
+		<section id="ugr.tools.tm.language.expressions.boolean">
+			<title>Boolean Expressions</title>
+			<para>
+			</para>
+		</section>
+	</section>
+	<section id="ugr.tools.tm.language.filtering">
+		<title>Robust extraction using filtering</title>
+		<para>
+			Rule based or pattern based information extraction systems often
+			suffer from unimportant fill words, additional whitespace and
+			unexpected markup. The TextMarker System enables the knowledge
+			engineer to filter and to hide all possible combinations of
+			predefined and new types of annotations. Additionally, it can
+			differentiate between every kind of HTML markup and XML tags. The
+			visibility of tokens and annotations is modified by the actions of
+			rule elements and can be conditioned using the complete
+			expressiveness of the language. Therefore the TextMarker system
+			supports a robust approach to information extraction and simplifies
+			the creation of new rules since the knowledge engineer can focus on
+			important textual features. If no rule action changed the
+			configuration of the filtering settings, then the default filtering
+			configuration ignores whitespaces and markup. Using the default
+			setting, the following rule matches all four types of input in this
+			example:
+			<programlisting><![CDATA[
+"Dr" PERIOD CW CW
+]]></programlisting>
+			<programlisting><![CDATA[
+Dr. Peter Steinmetz
+Dr . Peter      Steinmetz
+Dr. <b><i>Peter</i> Steinmetz</b>
+Dr.PeterSteinmetz
+]]></programlisting>
+		</para>
+	</section>
+	<section id="ugr.tools.tm.language.blocks">
+		<title>Blocks</title>
+		<para>
+			Blocks combine some more complex control structures in the
+			TextMarker
+			language: conditioned statement, loops and procedures.
+
+
+			The
+			rule
+			element
+			in the definition of a block has to define a
+			condition/action
+			part,
+			even if that part is empty (LCURLY and
+			RCURLY).
+
+
+			A block can use
+			normal
+			conditions to condition the execution
+			of its
+			containing rules.
+
+			Examples:
+
+			<programlisting><![CDATA[
+DECLARE Month;
+
+BLOCK(EnglishDates) Document{FEATURE("language", "en")} {
+    Document{->MARKFAST(Month,'englishMonthNames.txt')};
+    //...
+}
+
+BLOCK(GermanDates) Document{FEATURE("language", "de")} {
+    Document{->MARKFAST(Month,'germanMonthNames.txt')};
+    //...
+}
+]]></programlisting>
+
+
+			A block can be used to execute the containing rule on a sequence of
+			similar text passages.
+
+			Examples:
+			<programlisting><![CDATA[
+BLOCK(Paragraphs) Paragraphs{} { // <- limit the local view on the document: defines a local document
+    // This rule will be executed for each Paragraph that can be found in the current document.
+    Document{CONTAINS(Keyword)->MARK(SpecialParagraph)}; 
+    // Here, Document represents not the complete input document, but each Paragraph defined by the block statement.
+}
+]]></programlisting>
+		</para>
+	</section>
+	<section id="ugr.tools.tm.language.score">
+		<title>Heuristic extraction using scoring rules</title>
+		<para>
+			Diagnostic scores are a well known and successfully applied
+			knowledge
+			formalization pattern for diagnostic problems. Single known
+			findings
+			valuate a possible solution by adding or subtracting points
+			on an
+			account of that solution. If the sum exceeds a given threshold,
+			then
+			the solution is derived. One of the advantages of this pattern
+			is the
+			robustness against missing or false findings, since a high
+			number of
+			findings is used to derive a solution.
+
+			The TextMarker system
+			tries to
+			transfer this diagnostic problem
+			solution
+			strategy to the
+			information
+			extraction problem. In addition to a
+			normal creation of a
+			new
+			annotation, a MARK action can add positive
+			or negative scoring
+			points
+			to the text fragments matched by the rule
+			elements. If the
+			amount of
+			points exceeds the defined threshold for
+			the respective
+			type, then a
+			new annotation will be created. Further,
+			the current
+			value of heuristic
+			points of a possible annotation can
+			be
+			evaluated by
+			the SCORE condition.
+			In the following, the heuristic
+			extraction using
+			scoring rules is
+			demonstrated by a short example:
+
+			<programlisting><![CDATA[
+            Paragraph{CONTAINS(W,1,5)->MARKSCORE(5,Headline)};
+            Paragraph{CONTAINS(W,6,10)->MARKSCORE(2,Headline)};
+            Paragraph{CONTAINS(Emph,80,100,true)->MARKSCORE(7,Headline)};
+            Paragraph{CONTAINS(Emph,30,80,true)->MARKSCORE(3,Headline)};
+            Paragraph{CONTAINS(CW,50,100,true)->MARKSCORE(7,Headline)};
+            Paragraph{CONTAINS(W,0,0)->MARKSCORE(-50,Headline)};
+            Headline{SCORE(10)->MARK(Realhl)};
+            Headline{SCORE(5,10)->LOG("Maybe a headline")};
+                ]]></programlisting>
+
+
+			In the first part of this rule set, annotations of the type
+			paragraph
+			receive scoring points for a headline annotation, if they
+			fulfill
+			certain CONTAINS conditions. The first condition, for
+			example,
+			evaluates to true, if the paragraph contains one word up to
+			five
+			words, whereas the fourth condition is fulfilled, if the
+			paragraph
+			contains thirty up to eighty percent of emph annotations.
+			The last
+			two
+			rules finally execute their actions, if the score of a
+			headline
+			annotation exceeds ten points, or lies in the interval of
+			five and
+			ten
+			points, respectively.
+		</para>
+	</section>
+	<section id="ugr.tools.tm.language.modification">
+		<title>Modification</title>
+		<para>
+			There are different actions that can modify the input document,
+			like DEL,
+			COLOR and REPLACE. But the input document itself can not be
+			modified
+			directly. A separate engine, the Modifier.xml, has to be
+			called in
+			order to create another cas view with the name "modified".
+			In that
+			document all modifications are executed.
+		</para>
+	</section>
+</chapter>
\ No newline at end of file

Added: uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.overview.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.overview.xml?rev=1363750&view=auto
==============================================================================
--- uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.overview.xml (added)
+++ uima/sandbox/trunk/TextMarker/uima-docbook-textmarker/src/docbook/tools.textmarker.overview.xml Fri Jul 20 12:27:14 2012
@@ -0,0 +1,402 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+<!ENTITY imgroot "images/tools/tools.textmarker/" >
+<!ENTITY % uimaents SYSTEM "../../target/docbook-shared/entities.ent" >  
+%uimaents;
+]>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<chapter id="ugr.tools.tm.introduction">
+    <title>TextMarker</title>
+    <para>The TextMarker system is an open source tool
+        for the development
+        of rule-based information extraction applications.
+        The development
+        environment is based on the DLTK framework. It
+        supports the knowledge
+        engineer with a full-featured rule editor,
+        components for the
+        explanation of the rule inference and a build
+        process for generic UIMA
+        Analysis Engines and Type Systems.
+        Therefore TextMarker components can
+        be easily created and combined
+        with other UIMA components in different
+        information extraction
+        pipelines rather flexibly.
+
+        TextMarker applies a
+        specialized rule representation language for the effective
+        knowledge
+        formalization:
+        The rules of the TextMarker language are composed of a
+        list of rule
+        elements that themselves consist of four parts: The
+        mandatory
+        matching condition establishes a connection to the input
+        document by
+        referring to an already existing concept, respectively
+        annotation.
+        The
+        optional quantifier defines the usage of the matching
+        condition
+        similar to regular expressions. Then, additional conditions
+        add
+        constraints to the matched text fragment and additional actions
+        determine the consequences of the rule. Therefore, TextMarker rules
+        match on a pattern of given annotations and, if the additional
+        conditions evaluate to true, then they execute their actions, e.g.
+        create
+        a new annotation. If no initial annotations exist, for example,
+        created by another component, a scanner is used to seed simple token
+        annotations contained in a taxonomy.
+
+        The TextMarker system provides
+        unique functionality that is usually not
+        found in similar systems. The
+        actions are able to modify the document
+        either by replacing or
+        deleting text fragments or by filtering the
+        view on the document. In
+        this case, the rules ignore some
+        annotations,
+        e.g. HTML markup, or are
+        executed only on the remaining text passages.
+        The knowledge engineer
+        is able to add heuristic knowledge by using
+        scoring rules.
+        Additionally, several language elements common to
+        scripting languages
+        like conditioned statements, loops, procedures,
+        recursion, variables
+        and expressions increase the expressiveness of
+        the language. Rules are
+        able to directly invoke external rule sets or
+        arbitrary UIMA Analysis
+        Engines and foreign libraries can be
+        integrated with the extension
+        mechanism for new language elements.
+
+    </para>
+    <section id="ugr.tools.tm.introduction.metaphor">
+        <title>Introduction</title>
+        <para>
+            In manual information extraction humans often apply a strategy
+            according to a highlighter metaphor: First relevant headlines are
+            considered and classified according to their content by coloring
+            them
+            with different highlighters. The paragraphs of the annotated
+            headlines
+            are then considered further. Relevant text fragments or
+            single words
+            in the context of that headline can then be colored. In
+            this way, a
+            top-down analysis and extraction strategy is implemented.
+            Necessary
+            additional information can then be added that either refers
+            to other
+            text segments or contains valuable domain specific
+            information.
+            Finally the colored text can be easily analyzed
+            concerning the
+            relevant information.
+
+            The TextMarker system (textmarker
+            is a common German word for a
+            highlighter) tries to imitate this
+            manual extraction method by
+            formalizing the appropriate actions using
+            matching rules: The rules
+            mark sequences of words, extract text
+            segments or modify the input
+            document depending on textual
+            features. The default input for the
+            TextMarker system is
+            semi-structured text, but it can also process
+            structured or free
+            text. Technically, HTML is often the input
+            format,
+            since most word
+            processing documents can be converted to HTML.
+            Additionally, the
+            TextMarker system offers the possibility to
+            create
+            a modified output
+            document.
+        </para>
+    </section>
+    <section id="ugr.tools.tm.introduction.concepts">
+        <title>Core Concepts</title>
+        <para>
+            As a first step in the extraction process the TextMarker system uses
+            a
+            tokenizer (scanner) to tokenize the input document and to create a
+            stream of basic symbols. The types and valid annotations of the
+            possible tokens are predefined by a taxonomy of annotation types.
+            Annotations simply refer to a section of the input document and
+            assign a type or concept to the respective text fragment. The figure
+            on the right shows an excerpt of a basic annotation taxonomy: CW
+            describes all tokens, for example, that contain a single word
+            starting with a capital letter, MARKUP corresponds to HTML or XML
+            tags, and PM refers to all kinds of punctuation marks. Take a look
+            at the basic annotations for a complete list of
+            initial annotations.
+
+
+            <screenshot>
+                <mediaobject>
+                    <imageobject>
+                        <imagedata scale="80" format="PNG" fileref="&imgroot;symboltaxo.png" />
+                    </imageobject>
+                    <textobject>
+                        <phrase>Part of a taxonomy for basic annotation types.</phrase>
+                    </textobject>
+                </mediaobject>
+            </screenshot>
+
+            By using (and extending) the taxonomy, the knowledge engineer is
+            able
+            to choose the most adequate types and concepts when defining new
+            matching rules, i.e., TextMarker rules for matching a text fragment
+            given by a set of symbols to an annotation. If the capitalization of
+            a word, for example, is of no importance, then the annotation type W
+            that describes words of any kind can be used. The initial scanner
+            creates a set of basic annotations that may be used by the matching
+            rules of the TextMarker language. However, most information
+            extraction applications require domain specific concepts and
+            annotations. Therefore, the knowledge engineer is able to extend the
+            set of annotations, and to define new annotation types tuned to the
+            requirements of the given domain. These types can be flexibly
+            integrated in the taxonomy of annotation types.
+
+            One of the goals in
+            developing a new information extraction language
+            was
+            to maintain an
+            easily readable syntax while still providing a
+            scalable
+            expressiveness of the language. Basically, the TextMarker
+            language
+            contains expressions for the definition of new annotation
+            types and
+            for defining new matching rules. The rules are defined by a
+            list of
+            rule elements.
+            Each rule element contains at least a basic matching
+            condition referring
+            to text fragments or already specified
+            annotations. Additionally a
+            list of conditions and actions may be
+            specified for a rule element.
+            Whereas the conditions describe
+            necessary attributes of the matched
+            text fragment, the actions point
+            to operations and assignments on
+            the
+            current fragments. These actions
+            will then only be executed if all
+            basic conditions matched on a text
+            fragment or the annotation and the
+            related conditions are fulfilled.
+        </para>
+    </section>
+    <section id="ugr.tools.tm.introduction.examples">
+        <title>Examples</title>
+        <para>
+            The usage of the language and its readability can be demonstrated by
+            simple examples:
+
+            <programlisting><![CDATA[
+                CW{INLIST('animals.txt') -> MARK(Animal)};
+                Animal "and" Animal{-> MARK(Animalpair, 1, 2, 3)};
+    ]]></programlisting>
+
+            The first rule looks at all capitalized words that are listed in an
+            external document animals.txt and creates a new annotation of the
+            type
+            animal using the boundaries of the matched word. The second rule
+            searches for an annotation of the type animal followed by the
+            literal
+            "and" and a second animal annotation. Then it will create a new
+            annotation animalpair covering the text segment that matched the
+            three
+            rule elements (the digit parameters refer to the numbers of the
+            matched
+            rule elements).
+
+            <programlisting><![CDATA[
+                Document{-> MARKFAST(Firstname, 'firstnames.txt')};
+                Firstname CW{-> MARK(Lastname)};
+                Paragraph{VOTE(Firstname, Lastname) -> LOG("Found more Firstnames than Lastnames")};
+    ]]></programlisting>
+
+            In this example, the first rule annotates all words that occur in
+            the
+            external document firstnames.txt with the type firstname. The
+            second
+            rule creates a lastname annotation for all capitalized words
+            that
+            follow a firstname annotation. The last rule finally processes
+            all
+            paragraph annotations. If the VOTE condition counts more
+            firstname
+            than lastname annotations, then the rule writes a log entry
+            with a
+            predefined message.
+
+
+            <programlisting><![CDATA[
+                ANY+{PARTOF(Paragraph), CONTAINS(Delete, 50, 100, true) -> MARK(Delete)};
+                Firstname{-> MARK(Delete,1 , 2)} Lastname;
+                Delete{-> DEL};
+            ]]></programlisting>
+
+            Here, the first rule looks for sequences of any kind of tokens
+            except
+            markup and creates one annotation of the type delete for each
+            sequence, if the tokens are part of a paragraph annotation and
+            contain together already more than 50% of delete annotations. The +
+            sign indicates this greedy processing. The second rule annotates
+            first
+            names followed by last names with the type delete and the third
+            rule
+            simply deletes all text segments that are associated with that
+            delete
+            annotation.
+
+        </para>
+    </section>
+    <section id="ugr.tools.tm.introduction.features">
+        <title>Special Features</title>
+        <para>
+            The TextMarker language features some special characteristics
+            that are
+            usually not found in other rule-based information extraction
+            systems
+            or even shift it towards scripting languages. The possibility
+            of
+            creating new annotation types and integrating them into the
+            taxonomy
+            facilitates an even more modular development of information
+            extraction systems.
+
+            Read more about robust extraction using
+            filtering, complex control
+            structures and heuristic extraction using
+            scoring rules.
+        </para>
+    </section>
+    <section id="ugr.tools.tm.introduction.getstarted">
+        <title>Get started</title>
+        <para>
+            This section gives you a short, technical introduction on
+            how to
+            get
+            started with the TextMarker system and mostly just links the
+            information
+            of the other wiki pages. Some knowledge about the usage
+            of Eclipse and
+            central concepts of UIMA are useful. TextMarker
+            consists of the
+            TextMarker rule language (and of course the rule
+            inference) and the
+            TextMarker workbench. Additionally, the CEV plugin
+            is used to edit
+            and
+            visualize annotated text. The TextRuler system
+            with implementations of
+            well known rule learning methods and
+            development extension with
+            support for test-driven development are
+            already integrated.
+        </para>
+        <section id="ugr.tools.tm.introduction.getstarted.running">
+            <title>Up and running</title>
+            <para>
+                First of all, install the Workbench and read the introduction
+                and its
+                examples. In order to verify if the Workbench is correctly
+                installed,
+                take a look at Help-About Eclipse-Installation Details
+                and compare
+                the installed plugins with the plugins you copied into
+                the plugins
+                folder of your Eclipse application. Normally most of the
+                plugins do
+                not cause any troubles, but the CEV does because of the
+                XPCom and
+                XULRunner dependencies. You should at least get the XPCom
+                plugin up
+                and running. However, you cannot use the additional HTML
+                functionality without the XULRunner plugin. If the plugins of the
+                installation guide do not work properly and a Google search for a
+                suitable plugin is not successful, then write a mail to the user
+                list and we will try to solve the problem. If all plugins are
+                correctly installed, then start the Eclipse application and switch
+                to
+                the TextMarker perspective (Window-Open Perspective-Other...)
+            </para>
+        </section>
+        <section id="ugr.tools.tm.introduction.getstarted.example">
+            <title>Learn by example</title>
+            <para>
+                Having a running Workbench, download the example project and
+                import/copy
+                this TextMarker project into your workspace. The project
+                contains
+                some simple rules for extracting the author, title and year
+                of
+                reference strings. Next, take a look at the project structure and
+                the
+                syntax and compare it with the example project and its contents.
+                Open
+                the Main.tm TextMarker script in the folder
+                script/de.uniwue.example
+                and press the Run button in the Eclipse
+                toolbar. The documents in
+                the
+                input folder will then be processed by
+                the Main.tm file and the
+                result of the information extraction task
+                is placed in the output
+                folder. As you can see, there are four
+                files: an xmiCAS for each
+                input file and an HTML file (the
+                modified/colored result). Open one of
+                the .xmi files with the CAS
+                Editor plugin (-popup menu-Open with) and
+                select some checkboxes in
+                the Annotation Browser view.
+            </para>
+        </section>
+        <section id="ugr.tools.tm.introduction.getstarted.doit">
+            <title>Do it yourself</title>
+            <para>
+                Try to write some rules yourself. Read the description of the
+                available
+                language constructs, e.g., conditions and actions and use
+                the
+                explanation component in order to take a closer look at the rule
+                inference. Then finally, read the rest of this document.
+            </para>
+        </section>
+    </section>
+</chapter>
\ No newline at end of file