You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by tw...@apache.org on 2007/05/28 16:45:26 UTC

svn commit: r542240 - in /incubator/uima/uimaj/trunk/uimaj-core/src: main/java/org/apache/uima/cas/impl/XmiCasSerializer.java main/java/org/apache/uima/internal/util/XMLUtils.java test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java

Author: twgoetz
Date: Mon May 28 07:45:25 2007
New Revision: 542240

URL: http://svn.apache.org/viewvc?view=rev&rev=542240
Log:
Jira UIMA-387: modified XMI serializer to throw an exception when
trying to serialize invalid XML 1.0 characters.  Test case added.

http://issues.apache.org/jira/browse/UIMA-387

Added:
    incubator/uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java
Modified:
    incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/XmiCasSerializer.java
    incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/XMLUtils.java

Modified: incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/XmiCasSerializer.java
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/XmiCasSerializer.java?view=diff&rev=542240&r1=542239&r2=542240
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/XmiCasSerializer.java (original)
+++ incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/XmiCasSerializer.java Mon May 28 07:45:25 2007
@@ -41,6 +41,7 @@
 import org.apache.uima.cas.impl.XmiSerializationSharedData.XmiArrayElement;
 import org.apache.uima.internal.util.IntStack;
 import org.apache.uima.internal.util.IntVector;
+import org.apache.uima.internal.util.XMLUtils;
 import org.apache.uima.internal.util.XmlAttribute;
 import org.apache.uima.internal.util.XmlElementName;
 import org.apache.uima.internal.util.XmlElementNameAndContents;
@@ -800,6 +801,7 @@
             break;
           }
         }
+        checkXml10String(attrValue);
         if (attrValue != null && featName != null) {
           addAttribute(attrs, featName, attrValue);
         }
@@ -822,10 +824,19 @@
       return childElements;
     }
 
+    private final void checkXml10String(String s) throws SAXParseException {
+      final int index = XMLUtils.checkForNonXmlCharacters(s);
+      if (index >= 0) {
+        throw new SAXParseException("Trying to serialize non-XML 1.0 character: " + s.charAt(index)
+            + ", 0x" + Integer.toHexString(s.charAt(index)), null);
+      }
+    }
+    
     private void addText(String text) throws SAXException {
+      checkXml10String(text);
       ch.characters(text.toCharArray(), 0, text.length());
     }
-
+    
     private void addAttribute(AttributesImpl attrs, String attrName, String attrValue) {
       attrs.addAttribute(null, null, attrName, cdataType, attrValue);
     }

Modified: incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/XMLUtils.java
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/XMLUtils.java?view=diff&rev=542240&r1=542239&r2=542240
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/XMLUtils.java (original)
+++ incubator/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/XMLUtils.java Mon May 28 07:45:25 2007
@@ -23,7 +23,6 @@
 import java.io.Writer;
 import java.lang.reflect.Constructor;
 
-import org.apache.uima.util.InvalidXMLException;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
@@ -76,26 +75,26 @@
           aResultBuf.append("&#").append((int) c).append(';');
         } else {
           switch (c) {
-            case '<':
-              aResultBuf.append("&lt;");
-              break;
-            case '>':
-              aResultBuf.append("&gt;");
-              break;
-            case '&':
-              aResultBuf.append("&amp;");
-              break;
-            case '"':
-              aResultBuf.append("&quot;");
-              break;
-            case '\n':
-              aResultBuf.append(aNewlinesToSpaces ? " " : "\n");
-              break;
-            case '\r':
-              aResultBuf.append(aNewlinesToSpaces ? " " : "\r");
-              break;
-            default:
-              aResultBuf.append(c);
+          case '<':
+            aResultBuf.append("&lt;");
+            break;
+          case '>':
+            aResultBuf.append("&gt;");
+            break;
+          case '&':
+            aResultBuf.append("&amp;");
+            break;
+          case '"':
+            aResultBuf.append("&quot;");
+            break;
+          case '\n':
+            aResultBuf.append(aNewlinesToSpaces ? " " : "\n");
+            break;
+          case '\r':
+            aResultBuf.append(aNewlinesToSpaces ? " " : "\r");
+            break;
+          default:
+            aResultBuf.append(c);
           }
         }
       }
@@ -119,7 +118,7 @@
    *           if an I/O failure occurs when writing to <code>aWriter</code>
    */
   public static void writeNormalizedString(String aStr, Writer aWriter, boolean aNewlinesToSpaces)
-          throws IOException {
+      throws IOException {
     if (aStr == null)
       return;
 
@@ -127,26 +126,26 @@
     for (int i = 0; i < len; i++) {
       char c = aStr.charAt(i);
       switch (c) {
-        case '<':
-          aWriter.write("&lt;");
-          break;
-        case '>':
-          aWriter.write("&gt;");
-          break;
-        case '&':
-          aWriter.write("&amp;");
-          break;
-        case '"':
-          aWriter.write("&quot;");
-          break;
-        case '\n':
-          aWriter.write(aNewlinesToSpaces ? " " : "\n");
-          break;
-        case '\r':
-          aWriter.write(aNewlinesToSpaces ? " " : "\r");
-          break;
-        default:
-          aWriter.write(c);
+      case '<':
+        aWriter.write("&lt;");
+        break;
+      case '>':
+        aWriter.write("&gt;");
+        break;
+      case '&':
+        aWriter.write("&amp;");
+        break;
+      case '"':
+        aWriter.write("&quot;");
+        break;
+      case '\n':
+        aWriter.write(aNewlinesToSpaces ? " " : "\n");
+        break;
+      case '\r':
+        aWriter.write(aNewlinesToSpaces ? " " : "\r");
+        break;
+      default:
+        aWriter.write(c);
       }
     }
   }
@@ -207,7 +206,7 @@
    *           if the ContentHandler throws an exception
    */
   public static void writePrimitiveValue(Object aObj, ContentHandler aContentHandler)
-          throws SAXException {
+      throws SAXException {
     final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
 
     String className = aObj.getClass().getName();
@@ -278,7 +277,7 @@
    * @return the value that was read, <code>null</code> if a primitive value could not be
    *         constructed from the element
    */
-  public static Object readPrimitiveValue(Element aElem) throws InvalidXMLException {
+  public static Object readPrimitiveValue(Element aElem) {
     // the element's tag name is the lowercase name of the class, minus the
     // package name
     String tagName = aElem.getTagName();
@@ -374,4 +373,46 @@
 
     return buf.toString().trim();
   }
+
+  /**
+   * Check the input string for non-XML 1.0 characters. If non-XML characters are found, return the
+   * position of the first offending character. Otherwise, return <code>-1</code>.
+   * 
+   * @param s
+   *          Input string
+   * @return The position of the first invalid XML character encountered. <code>-1</code> if no
+   *         invalid XML characters found.
+   */
+  public static final int checkForNonXmlCharacters(String s) {
+    if (s == null) {
+      return -1;
+    }
+    for (int i = 0; i < s.length(); i++) {
+      if (!isValidXml10Char(s.charAt(i))) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  // Check if the character we're looking at is a valid XML 1.0 character. From the XML 1.0 spec:
+  //
+  // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode
+  // character, excluding the surrogate blocks, FFFE, and FFFF. */
+  //
+  // And from the UTF-16 spec:
+  //
+  // Characters with values between 0x10000 and 0x10FFFF are
+  // represented by a 16-bit integer with a value between 0xD800 and
+  // 0xDBFF (within the so-called high-half zone or high surrogate
+  // area) followed by a 16-bit integer with a value between 0xDC00 and
+  // 0xDFFF (within the so-called low-half zone or low surrogate area).
+  //
+  // NOTE: the check below looks at individual Java chars only, so every surrogate code unit
+  // (0xD800-0xDFFF) is rejected, including valid pairs that encode [#x10000-#x10FFFF].
+  private static final boolean isValidXml10Char(char c) {
+    return ((c == 0x9) || (c == 0xA) || (c == 0xD) || ((c >= 0x20) && (c <= 0xD7FF)) || 
+        ((c >= 0xE000) && (c <= 0xFFFD)));
+  }
+
 }

Added: incubator/uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java?view=auto&rev=542240
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java (added)
+++ incubator/uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/cas/impl/XmiCasSerializerTest.java Mon May 28 07:45:25 2007
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.cas.impl;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.test.junit_extension.JUnitExtension;
+import org.apache.uima.util.CasCreationUtils;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XMLSerializer;
+import org.xml.sax.SAXParseException;
+
+/**
+ * Test case for XMI serialization, in particular, invalid XML 1.0 characters. Other aspects of XMI
+ * serialization are tested elsewhere.
+ */
+public class XmiCasSerializerTest extends TestCase {
+
+  private TypeSystemDescription typeSystemDesc = null;
+
+  private File outputFile = null;
+
+  /**
+   * @param arg0
+   */
+  public XmiCasSerializerTest(String arg0) {
+    super(arg0);
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see junit.framework.TestCase#setUp()
+   */
+  protected void setUp() throws Exception {
+    File typeSystemFile = JUnitExtension.getFile("ExampleCas/testTypeSystem.xml");
+    // Temp output file, deleted in tearDown().
+    this.outputFile = new File(JUnitExtension.getFile("ExampleCas"),
+        "xmiSerializerInvalidCharsTestOutput.xmi");
+    System.out.println(this.outputFile.getAbsolutePath());
+    this.typeSystemDesc = UIMAFramework.getXMLParser().parseTypeSystemDescription(
+        new XMLInputSource(typeSystemFile));
+  }
+
+  public void testInvalidCharsInDocumentText() throws Exception {
+    CAS cas = CasCreationUtils.createCas(this.typeSystemDesc, null, null);
+    char badChar = 0x1A;
+    cas.setDocumentText("Text with bad char: " + badChar);
+    OutputStream out = new FileOutputStream(this.outputFile);
+    XMLSerializer xmlSerializer = new XMLSerializer(out);
+    XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(cas.getTypeSystem());
+    boolean caughtException = false;
+    try {
+      xmiCasSerializer.serialize(cas, xmlSerializer.getContentHandler());
+    } catch (SAXParseException e) {
+      caughtException = true;
+    }
+    out.close();
+    assertTrue("XMI serialization of document text with bad XML 1.0 char should throw exception",
+        caughtException);
+  }
+
+  public void testInvalidCharsInFeatureValue() throws Exception {
+    CAS cas = CasCreationUtils.createCas(this.typeSystemDesc, null, null);
+    char badChar = 0x1A;
+    cas.setDocumentLanguage("a" + badChar);
+    OutputStream out = new FileOutputStream(this.outputFile);
+    XMLSerializer xmlSerializer = new XMLSerializer(out);
+    XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(cas.getTypeSystem());
+    boolean caughtException = false;
+    try {
+      xmiCasSerializer.serialize(cas, xmlSerializer.getContentHandler());
+    } catch (SAXParseException e) {
+      caughtException = true;
+    }
+    out.close();
+    assertTrue("XMI serialization of document text with bad XML 1.0 char should throw exception",
+        caughtException);
+  }
+
+  /*
+   * (non-Javadoc)
+   * 
+   * @see junit.framework.TestCase#tearDown()
+   */
+  protected void tearDown() throws Exception {
+    if ((this.outputFile != null) && this.outputFile.exists()) {
+      this.outputFile.delete();
+    }
+  }
+
+}