You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by mb...@apache.org on 2008/07/09 15:14:02 UTC

svn commit: r675156 - in /incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais: OpenCalaisAnnotator.java RDFSaxHandler.java

Author: mbaessler
Date: Wed Jul  9 06:14:02 2008
New Revision: 675156

URL: http://svn.apache.org/viewvc?rev=675156&view=rev
Log:
UIMA-1108

Add special-character offset handling for <, >, ", ', &, \n and \r.

https://issues.apache.org/jira/browse/UIMA-1108

Modified:
    incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/OpenCalaisAnnotator.java
    incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/RDFSaxHandler.java

Modified: incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/OpenCalaisAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/OpenCalaisAnnotator.java?rev=675156&r1=675155&r2=675156&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/OpenCalaisAnnotator.java (original)
+++ incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/OpenCalaisAnnotator.java Wed Jul  9 06:14:02 2008
@@ -27,6 +27,7 @@
 import java.net.URL;
 import java.net.URLConnection;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 
@@ -111,6 +112,8 @@
   private URL calaisService;
 
   private HashMap<String, Type> typeMapping;
+  
+  private String[] charsToReplace = {"<", ">", "\"", "'", "&"};
 
   public void process(CAS aCas) throws AnalysisEngineProcessException {
 
@@ -121,7 +124,14 @@
       BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(connection
               .getOutputStream(), "UTF-8"));
       writer.write(this.serviceParams);
-      writer.write(aCas.getDocumentText());
+      String modifiedText = aCas.getDocumentText();
+      for(int i = 0; i < this.charsToReplace.length; i++) {
+        modifiedText = modifiedText.replaceAll(this.charsToReplace[i], "");
+      }
+      modifiedText = modifiedText.replaceAll("\n", " ");
+      modifiedText = modifiedText.replaceAll("\r", " ");
+     
+      writer.write(modifiedText);
       writer.flush();
       writer.close();
 
@@ -138,26 +148,58 @@
               RdfXmlContent.getBytes(feedDoc.getXmlEncoding())));
 
       // create SAX handler
-      ArrayList<DescriptionElement> elements = new ArrayList<DescriptionElement>();
-      HashMap<String, DescriptionElement> subjectMap = new HashMap<String, DescriptionElement>();
+      HashMap<String, DescriptionElement> elements = new HashMap<String, DescriptionElement>();
+      ArrayList<DescriptionElement> subjectMap = new ArrayList<DescriptionElement>();
       Offset offset = new Offset();
       RDFSaxHandler saxHandler = new RDFSaxHandler(elements, subjectMap, offset);
 
       // parse RDF XML content returned by the calais service
       this.saxParser.parse(bufByteIn, saxHandler);
 
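+      // collect the positions of all characters that were stripped from the text sent to the service (needed for offset correction)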
+      String text = aCas.getDocumentText();
+      ArrayList<Integer> positionsList = new ArrayList<Integer>();
+      int index = -1;
+      for(int i = 0; i < this.charsToReplace.length; i++) {
+        index = text.indexOf(this.charsToReplace[i]);
+        while(index > -1) {
+          positionsList.add(index);
+          index = text.indexOf(this.charsToReplace[i],index + 1);
+        }
+      }
+     //now the positions list contains all positions where characters have been removed
+      Integer[] positions = positionsList.toArray(new Integer[]{});
+      
+      Arrays.sort(positions);
+            
       // analyze entities
-      Iterator<DescriptionElement> elementIt = elements.iterator();
+      Iterator<DescriptionElement> elementIt = subjectMap.iterator();
       while (elementIt.hasNext()) {
         DescriptionElement element = elementIt.next();
-        // if for the typeURL is a mapping available, create annotation in the CAS
-        Type currentType = this.typeMapping.get(element.getTypeURL());
+        
+        // look up the element whose about URL equals this element's subject URL;
+        // that element carries the type URL of the current element
+        DescriptionElement typeElement = elements.get(element.getSubjectURL());
+        String typeURL = typeElement.getTypeURL();
+        
+        // get current CAS type for the type URL
+        Type currentType = this.typeMapping.get(typeURL);
+        
+        //if mapping is available, create an annotation
         if (currentType != null) {
-          // mapping is available, create annotation
           // get reference element that contains the annotation span
-          DescriptionElement refElement = subjectMap.get(element.getAboutURL());
-          int begin = refElement.getOffset() - offset.getOffset();
-          int end = begin + refElement.getLength();
+
+          int begin = element.getOffset() - offset.getOffset();
+          
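+          // shift begin right by one for each removed character located before it; NOTE(review): a removed character exactly at begin is not counted (pos < begin) - confirm whether pos <= begin is intended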
+          for(int i = 0; i < positions.length; i++) {
+            Integer pos = positions[i];
+            if(pos < begin) {
+              begin++;
+            }
+          }
+                  
+          int end = begin + element.getLength();
           // create annotation
           AnnotationFS annotFs = aCas.createAnnotation(currentType, begin, end);
           annotFs.setStringValue(this.calaisTypeFeat, element.getTypeURL().intern());
@@ -324,9 +366,9 @@
     // set processing directives
     buffer.append("<c:processingDirectives");
     // set parameter contentType = TEXT/TXT
-    buffer.append(" c:contentType=\"TEXT/TXT\"");
+    buffer.append(" c:contentType=\"TEXT/html\"");
     // set parameter outputFormat = XML/RDF
-    buffer.append(" c:outputFormat=\"XML/RDF\">");
+    buffer.append(" c:outputFormat=\"xml/rdf\">");
     // close processing directives
     buffer.append("</c:processingDirectives>");
 

Modified: incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/RDFSaxHandler.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/RDFSaxHandler.java?rev=675156&r1=675155&r2=675156&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/RDFSaxHandler.java (original)
+++ incubator/uima/sandbox/trunk/OpenCalaisAnnotator/src/main/java/org/apache/uima/annotator/calais/RDFSaxHandler.java Wed Jul  9 06:14:02 2008
@@ -33,10 +33,10 @@
     private boolean enableDocument = false;
     private Offset offset;
     
-    private ArrayList<DescriptionElement> elements;
-    private HashMap<String, DescriptionElement> subjectMap;
+    private HashMap<String, DescriptionElement> elements;
+    private ArrayList<DescriptionElement> subjectMap;
   
-   public RDFSaxHandler(ArrayList<DescriptionElement> elements, HashMap<String, DescriptionElement> subjectMap, Offset offset) {
+   public RDFSaxHandler(HashMap<String, DescriptionElement> elements, ArrayList<DescriptionElement> subjectMap, Offset offset) {
      this.elements = elements;
      this.subjectMap = subjectMap;
      this.offset = offset;
@@ -52,9 +52,9 @@
          throws SAXException {
      
      if(qName.equals("rdf:Description")) {
-       this.elements.add(this.currentDesc);
+       this.elements.put(this.currentDesc.getAboutURL(), this.currentDesc);
        if(this.currentDesc.getSubjectURL() != null) {
-         this.subjectMap.put(this.currentDesc.getSubjectURL(), this.currentDesc);
+         this.subjectMap.add(this.currentDesc);
        }
      }