You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2013/03/21 14:44:06 UTC
svn commit: r1459309 [2/2] - in /uima/sandbox/textmarker/trunk: textmarker-core/src/main/antlr3/org/apache/uima/textmarker/parser/ textmarker-core/src/main/java/org/apache/uima/textmarker/ textmarker-core/src/main/java/org/apache/uima/textmarker/action...

Modified: uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.language.syntax.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.language.syntax.xml?rev=1459309&r1=1459308&r2=1459309&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.language.syntax.xml (original)
+++ uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.language.syntax.xml Thu Mar 21 13:44:04 2013
@@ -83,7 +83,11 @@ BlockDeclaration    -> "BLOCK" "(" Ident
                                                        "{" Statements "}"]]></programlisting>
 
     Syntax of statements and rule elements:
-    <programlisting><![CDATA[SimpleStatement        -> RuleElements ";"
+    <programlisting><![CDATA[SimpleStatement        -> RuleElements ";" | RegExpRule ";"
+RegExpRule             -> StringExpression "->" GroupAssignment 
+                          ("," GroupAssignment)*
+GroupAssignment        -> TypeExpression 
+                        | NumberExpression "=" TypeExpression
 RuleElements           -> RuleElement+
 RuleElement            -> RuleElementType | RuleElementLiteral
                         | RuleElementComposed | RuleElementDisjunctive

Modified: uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.language.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.language.xml?rev=1459309&r1=1459308&r2=1459309&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.language.xml (original)
+++ uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.language.xml Thu Mar 21 13:44:04 2013
@@ -561,4 +561,31 @@ Document{->MARKTABLE(PresidentOfUSA, 1, 
       </para>
     </section>
   </section>
+  <section id="ugr.tools.tm.language.regexprule">
+    <title>Simple Rules based on Regular Expressions</title>
+    <para>
+      The TextMarker language includes, in addition to the normal rules, a simplified rule syntax for processing regular expressions.
+      These simple rules consist of two parts separated by <quote>-></quote>: The left part is the regular expression 
+      (flags: DOTALL and MULTILINE), which may contain capturing groups. The right part defines which kind of annotations 
+      should be created for each match of the regular expression. If a type is given without a group index, then an annotation of that type is
+      created for the complete regular expression match, which corresponds to group 0. These simple rules can be restricted to match only within
+      certain annotations using the BLOCK construct, and ignore all filtering settings.
+    </para>
+    
+    <programlisting><![CDATA[
+RegExpRule      -> StringExpression "->" GroupAssignment 
+                  ("," GroupAssignment)* ";"
+GroupAssignment -> TypeExpression | NumberExpression "=" TypeExpression
+]]></programlisting>
+    
+    <para>
+      The following example contains a simple rule, which is able to create annotations of two different types. It creates an annotation 
+      of the type <quote>T1</quote> for each match of the complete regular expression and an annotation 
+      of the type <quote>T2</quote> for each match of the first capturing group.
+    </para>
+    
+    <programlisting><![CDATA["A(.*?)C" -> T1, 1 = T2;]]></programlisting>
+    
+    
+  </section>
 </chapter>
\ No newline at end of file

Modified: uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml?rev=1459309&r1=1459308&r2=1459309&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml (original)
+++ uima/sandbox/textmarker/trunk/textmarker-docbook/src/docbook/tools.textmarker.overview.xml Thu Mar 21 13:44:04 2013
@@ -179,7 +179,15 @@ W{REGEXP("dog") -> MARK(Animal)};]]></pr
       The default seeder does actually not add annotations of the type <quote>W</quote>, but annotations of the types <quote>SW</quote> and 
       <quote>CW</quote> for small written words and capitalized words, which both have the parent type <quote>W</quote>.
     </para>
-
+    
+    <para>
+      There is also a special kind of rule, which follows a different syntax and semantics and enables a simplified creation of annotations based on regular expressions.
+      The following rule, for example, creates an <quote>Animal</quote> annotation for each occurrence of <quote>dog</quote> or <quote>cat</quote>.
+    </para>
+    
+    <programlisting><![CDATA[DECLARE Animal;
+"dog|cat" -> Animal;]]></programlisting>
+    
     <para>
       Since it is tedious to create Animal annotations by matching on different regular expression, we apply an external dictionary in the next example.
       The first line defines a word list named <quote>AnimalsList</quote>, which is located in the resource folder (the file <quote>Animals.txt</quote> 

Modified: uima/sandbox/textmarker/trunk/textmarker-ep-addons/src/main/java/org/apache/uima/textmarker/explain/tree/ExplainTree.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-ep-addons/src/main/java/org/apache/uima/textmarker/explain/tree/ExplainTree.java?rev=1459309&r1=1459308&r2=1459309&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-ep-addons/src/main/java/org/apache/uima/textmarker/explain/tree/ExplainTree.java (original)
+++ uima/sandbox/textmarker/trunk/textmarker-ep-addons/src/main/java/org/apache/uima/textmarker/explain/tree/ExplainTree.java Thu Mar 21 13:44:04 2013
@@ -238,9 +238,11 @@ public class ExplainTree {
 
     Feature feature = ruleMatchType.getFeatureByBaseName(ExplainConstants.ELEMENTS);
     ArrayFS value = (ArrayFS) fs.getFeatureValue(feature);
-    FeatureStructure[] fsarray = value.toArray();
-    for (FeatureStructure each : fsarray) {
-      buildTree(each, remRoot, ts, offset, onlyRules);
+    if (value != null) {
+      FeatureStructure[] fsarray = value.toArray();
+      for (FeatureStructure each : fsarray) {
+        buildTree(each, remRoot, ts, offset, onlyRules);
+      }
     }
   }
 
@@ -271,15 +273,16 @@ public class ExplainTree {
 
     feature = ruleElementMatchType.getFeatureByBaseName(ExplainConstants.CONDITIONS);
     ArrayFS value = (ArrayFS) fs.getFeatureValue(feature);
-    FeatureStructure[] fsarray = value.toArray();
-    for (FeatureStructure each : fsarray) {
-      buildTree(each, remNode, ts, offset, onlyRules);
+    if (value != null) {
+      FeatureStructure[] fsarray = value.toArray();
+      for (FeatureStructure each : fsarray) {
+        buildTree(each, remNode, ts, offset, onlyRules);
+      }
     }
-
     feature = fs.getType().getFeatureByBaseName(ExplainConstants.ELEMENTS);
     value = (ArrayFS) fs.getFeatureValue(feature);
     if (value != null) {
-      fsarray = value.toArray();
+      FeatureStructure[] fsarray = value.toArray();
       for (FeatureStructure each : fsarray) {
         buildTree(each, remNode, ts, offset, onlyRules);
       }

Modified: uima/sandbox/textmarker/trunk/textmarker-ep-addons/src/main/java/org/apache/uima/textmarker/explain/tree/RuleElementMatchNode.java
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-ep-addons/src/main/java/org/apache/uima/textmarker/explain/tree/RuleElementMatchNode.java?rev=1459309&r1=1459308&r2=1459309&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-ep-addons/src/main/java/org/apache/uima/textmarker/explain/tree/RuleElementMatchNode.java (original)
+++ uima/sandbox/textmarker/trunk/textmarker-ep-addons/src/main/java/org/apache/uima/textmarker/explain/tree/RuleElementMatchNode.java Thu Mar 21 13:44:04 2013
@@ -41,11 +41,13 @@ public class RuleElementMatchNode extend
 
     f = fs.getType().getFeatureByBaseName(ExplainConstants.CONDITIONS);
     ArrayFS value = (ArrayFS) fs.getFeatureValue(f);
-    FeatureStructure[] fsarray = value.toArray();
-    for (FeatureStructure each : fsarray) {
-      Feature eachFeat = each.getType().getFeatureByBaseName(ExplainConstants.VALUE);
-      boolean eachValue = each.getBooleanValue(eachFeat);
-      matched &= eachValue;
+    if (value != null) {
+      FeatureStructure[] fsarray = value.toArray();
+      for (FeatureStructure each : fsarray) {
+        Feature eachFeat = each.getType().getFeatureByBaseName(ExplainConstants.VALUE);
+        boolean eachValue = each.getBooleanValue(eachFeat);
+        matched &= eachValue;
+      }
     }
   }
 

Modified: uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/antlr3/org/apache/uima/textmarker/ide/core/parser/TextMarkerParser.g
URL: http://svn.apache.org/viewvc/uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/antlr3/org/apache/uima/textmarker/ide/core/parser/TextMarkerParser.g?rev=1459309&r1=1459308&r2=1459309&view=diff
==============================================================================
--- uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/antlr3/org/apache/uima/textmarker/ide/core/parser/TextMarkerParser.g (original)
+++ uima/sandbox/textmarker/trunk/textmarker-ep-ide/src/main/antlr3/org/apache/uima/textmarker/ide/core/parser/TextMarkerParser.g Thu Mar 21 13:44:04 2013
@@ -566,12 +566,42 @@ ruleElementWithoutCA returns [TextMarker
 		
 simpleStatement returns [TextMarkerRule stmt = null]
 	: 
+	(regexpRule)=> rer = regexpRule {stmt = rer;}
+	|
 	elements=ruleElements 
 		s = SEMI 
 		{stmt = scriptFactory.createRule(elements, s);}
 		
 	;
 
+regexpRule returns [TextMarkerRule stmt = null]
+@init{
+	List<Expression> exprs = new ArrayList<Expression>();
+}
+	:
+	regexp = stringExpression {exprs.add(regexp);} {stmt = scriptFactory.createRule(exprs, s);} THEN
+	(
+	te = typeExpression {exprs.add(te);} {stmt = scriptFactory.createRule(exprs, s);}
+	|
+	indexCG = numberExpression {exprs.add(indexCG);}{stmt = scriptFactory.createRule(exprs, s);} ASSIGN_EQUAL indexTE = typeExpression {exprs.add(indexTE);}
+	)
+	(
+	COMMA
+	(
+	te = typeExpression {exprs.add(te);}{stmt = scriptFactory.createRule(exprs, s);}
+	|
+	indexCG = numberExpression {exprs.add(indexCG);}{stmt = scriptFactory.createRule(exprs, s);} ASSIGN_EQUAL indexTE = typeExpression {exprs.add(indexTE);}
+	)
+	
+	)*
+
+	s = SEMI
+	{stmt = scriptFactory.createRule(exprs, s);}
+	
+	;
+
+
+
 ruleElements returns [List<Expression> elements = new ArrayList<Expression>()]
 	:
 	re = ruleElement {if(re!=null) elements.add(re);} (re = ruleElement {if(re!=null) elements.add(re);})*