You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by pk...@apache.org on 2012/11/28 15:29:04 UTC

svn commit: r1414726 - in /uima/sandbox/trunk/TextMarker/uimaj-textmarker/src: main/java/org/apache/uima/textmarker/seed/ test/java/org/apache/uima/textmarker/seed/

Author: pkluegl
Date: Wed Nov 28 14:29:02 2012
New Revision: 1414726

URL: http://svn.apache.org/viewvc?rev=1414726&view=rev
Log:
UIMA-2508
- removed the markup rules from the lexer grammar
- replaced that functionality with a hotfix in the Java implementation of the seeder
- the lexer now creates annotations ignoring markup; the seeder uses a regexp to find markup and then removes the other annotations covered by it

Modified:
    uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/DefaultSeeder.java
    uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.flex
    uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.java
    uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/test/java/org/apache/uima/textmarker/seed/DefaultSeederTest.java

Modified: uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/DefaultSeeder.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/DefaultSeeder.java?rev=1414726&r1=1414725&r2=1414726&view=diff
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/DefaultSeeder.java (original)
+++ uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/DefaultSeeder.java Wed Nov 28 14:29:02 2012
@@ -21,18 +21,29 @@ package org.apache.uima.textmarker.seed;
 
 import java.io.BufferedReader;
 import java.io.StringReader;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.regex.MatchResult;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.FSIterator;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.cas.text.AnnotationIndex;
 import org.apache.uima.jcas.JCas;
+import org.apache.uima.textmarker.type.MARKUP;
 import org.apache.uima.textmarker.type.TokenSeed;
 
 public class DefaultSeeder implements TextMarkerAnnotationSeeder {
 
-  public static final String seedType =  "org.apache.uima.textmarker.type.TokenSeed";
-  
+  public static final String seedType = "org.apache.uima.textmarker.type.TokenSeed";
+
+  private final Pattern markupPattern = Pattern
+          .compile("</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>");
+
   public Type seed(String text, CAS cas) {
     Type result = null;
     JCas jCas = null;
@@ -40,7 +51,7 @@ public class DefaultSeeder implements Te
     try {
       jCas = cas.getJCas();
       size = jCas.getAnnotationIndex(TokenSeed.type).size();
-      result =  jCas.getTypeSystem().getType(seedType);
+      result = jCas.getTypeSystem().getType(seedType);
     } catch (CASException e1) {
     }
     // do not apply seeding if there are already annotations of this seed type
@@ -63,6 +74,26 @@ public class DefaultSeeder implements Te
       } catch (Exception e) {
       }
     }
+
+    // FIXME: lexer rules for html markup won't work. Therefore, those rules were removed in the
+    // grammar and the functionality is included directly with regexp
+    Matcher matcher = markupPattern.matcher(text);
+    Collection<AnnotationFS> toRemove = new LinkedList<AnnotationFS>();
+    while (matcher.find()) {
+      int begin = matcher.start();
+      int end = matcher.end();
+      MARKUP markup = new MARKUP(jCas, begin, end);
+      markup.addToIndexes();
+      FSIterator<AnnotationFS> subiterator = cas.getAnnotationIndex(result).subiterator(markup);
+      while(subiterator.isValid()) {
+        AnnotationFS fs = subiterator.get();
+        toRemove.add(fs);
+        subiterator.moveToNext();
+      }
+    }
+    for (AnnotationFS each : toRemove) {
+      cas.removeFsFromIndexes(each);
+    }
     return result;
   }
 }

Modified: uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.flex
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.flex?rev=1414726&r1=1414725&r2=1414726&view=diff
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.flex (original)
+++ uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.flex Wed Nov 28 14:29:02 2012
@@ -85,22 +85,7 @@ SPACE=[ \t]
                 return t;
     }
     
-    \<[/][A-Za-z][A-Za-z0-9]*[^>]*> {
-                MARKUP t = new MARKUP(cas);
-                t.setBegin(yychar);
-                t.setEnd(yychar + yytext().length());
-                
-                return t;
-    }
-                    
-    \<[A-Za-z][A-Za-z0-9]*[^>]*> {
-                MARKUP t = new MARKUP(cas);
-                t.setBegin(yychar);
-                t.setEnd(yychar + yytext().length());
-                
-                return t;
-    }
-                
+                                       
     \xA0|&nbsp;|&NBSP; {
                 NBSP t = new NBSP(cas);
                 t.setBegin(yychar);

Modified: uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.java?rev=1414726&r1=1414725&r2=1414726&view=diff
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.java (original)
+++ uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.java Wed Nov 28 14:29:02 2012
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.4.3 on 11.01.12 15:00 */
+/* The following code was generated by JFlex 1.4.3 on 28.11.12 14:06 */
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one
@@ -21,8 +21,12 @@
 
 
 package org.apache.uima.textmarker.seed;
+import java.util.*;
+import java.util.regex.*;
+
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.jcas.JCas;
+
 import org.apache.uima.textmarker.type.AMP;
 import org.apache.uima.textmarker.type.BREAK;
 import org.apache.uima.textmarker.type.CAP;
@@ -44,8 +48,8 @@ import org.apache.uima.textmarker.type.S
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.3
- * on 11.01.12 15:00 from the specification file
- * <tt>D:/work/workspace-uima3/uimaj-ep-textmarker-engine/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.flex</tt>
+ * on 28.11.12 14:06 from the specification file
+ * <tt>D:/work/workspace-textmarker-uima/uimaj-textmarker/src/main/java/org/apache/uima/textmarker/seed/SeedLexer.flex</tt>
  */
 class SeedLexer {
 
@@ -237,14 +241,12 @@ class SeedLexer {
     "\1\2\1\3\1\4\1\5\1\6\1\7\1\2\1\10"+
     "\1\2\1\11\1\12\4\13\1\14\4\3\1\15\1\16"+
     "\1\17\1\20\2\13\1\3\34\0\1\21\11\0\4\22"+
-    "\1\0\4\21\4\0\2\22\1\21\2\0\1\4\31\0"+
-    "\1\23\4\0\1\24\1\23\3\0\4\23\1\0\4\23"+
-    "\5\0\1\23\2\0\1\25\11\0\1\26\3\25\1\0"+
+    "\1\0\4\21\4\0\2\22\1\21\2\0\1\4\36\0"+
+    "\1\23\1\24\24\0\1\25\11\0\1\26\3\25\1\0"+
     "\1\27\3\25\5\0\1\25\14\0\4\13\11\0\2\13"+
     "\2\0\1\21\16\0\4\21\6\0\1\21\13\0\4\22"+
-    "\11\0\2\22\1\0\10\23\1\30\22\23\1\0\1\23"+
-    "\5\0\1\23\3\0\4\23\1\0\4\23\5\0\1\23"+
-    "\2\0\1\25\11\0\4\25\1\31\4\25\5\0\1\25"+
+    "\11\0\2\22\10\0\1\24\23\0\10\24\1\30\22\24"+
+    "\1\0\1\25\11\0\4\25\1\31\4\25\5\0\1\25"+
     "\2\0\1\25\11\0\1\25\1\32\2\25\1\31\4\25"+
     "\5\0\1\25\2\0\1\25\11\0\4\25\1\31\1\25"+
     "\1\33\2\25\5\0\1\25\2\0\1\25\11\0\2\25"+

Modified: uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/test/java/org/apache/uima/textmarker/seed/DefaultSeederTest.java
URL: http://svn.apache.org/viewvc/uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/test/java/org/apache/uima/textmarker/seed/DefaultSeederTest.java?rev=1414726&r1=1414725&r2=1414726&view=diff
==============================================================================
--- uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/test/java/org/apache/uima/textmarker/seed/DefaultSeederTest.java (original)
+++ uima/sandbox/trunk/TextMarker/uimaj-textmarker/src/test/java/org/apache/uima/textmarker/seed/DefaultSeederTest.java Wed Nov 28 14:29:02 2012
@@ -58,8 +58,9 @@ public class DefaultSeederTest {
     Type type = seeder.seed(text, cas);
     assertEquals("org.apache.uima.textmarker.type.TokenSeed", type.getName());
     AnnotationIndex<AnnotationFS> annotationIndex = cas.getAnnotationIndex(type);
-    assertEquals(26, annotationIndex.size());
+    assertEquals(40, annotationIndex.size());
     FSIterator<AnnotationFS> iterator = annotationIndex.iterator();
+
     assertEquals("CW", iterator.next().getType().getShortName());
     assertEquals("SPACE", iterator.next().getType().getShortName());
     assertEquals("SW", iterator.next().getType().getShortName());
@@ -79,6 +80,7 @@ public class DefaultSeederTest {
     assertEquals("SPACE", iterator.next().getType().getShortName());
     assertEquals("SPECIAL", iterator.next().getType().getShortName());
     assertEquals("SW", iterator.next().getType().getShortName());
+    assertEquals("SPACE", iterator.next().getType().getShortName());
     assertEquals("EXCLAMATION", iterator.next().getType().getShortName());
     assertEquals("QUESTION", iterator.next().getType().getShortName());
     assertEquals("PERIOD", iterator.next().getType().getShortName());
@@ -100,4 +102,6 @@ public class DefaultSeederTest {
     assertEquals("MARKUP", iterator.next().getType().getShortName());
     assertEquals("BREAK", iterator.next().getType().getShortName());
   }
+  
+  
 }