You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/02 19:45:05 UTC

svn commit: r1164655 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/sax/SafeContentHandler.java test/java/org/apache/tika/sax/SafeContentHandlerTest.java

Author: jukka
Date: Fri Sep  2 17:45:04 2011
New Revision: 1164655

URL: http://svn.apache.org/viewvc?rev=1164655&view=rev
Log:
TIKA-698: "Invalid UTF-16 surrogate detected:" parsing PowerPoint 97-2003

Proper handling of surrogate pairs in SafeContentHandler

Filter also attribute values

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java?rev=1164655&r1=1164654&r2=1164655&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java Fri Sep  2 17:45:04 2011
@@ -16,8 +16,10 @@
  */
 package org.apache.tika.sax;
 
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 /**
  * Content handler decorator that makes sure that the character events
@@ -51,6 +53,20 @@ public class SafeContentHandler extends 
         void write(char[] ch, int start, int length) throws SAXException;
     }
 
+    private static class StringOutput implements Output {
+
+        private final StringBuilder builder = new StringBuilder();
+
+        public void write(char[] ch, int start, int length) {
+            builder.append(ch, start, length);
+        }
+
+        public String toString() {
+            return builder.toString();
+        }
+
+    }
+
     /**
      * Output through the {@link ContentHandler#characters(char[], int, int)}
      * method of the decorated content handler.
@@ -94,8 +110,12 @@ public class SafeContentHandler extends 
             throws SAXException {
         int end = start + length;
 
-        for (int i = start; i < end; i++) {
-            if (isInvalid(ch[i])) {
+        int i = start;
+        while (i < end) {
+            int c = Character.codePointAt(ch, i, end);
+            int j = i + Character.charCount(c);
+
+            if (isInvalid(c)) {
                 // Output any preceding valid characters
                 if (i > start) {
                     output.write(ch, start, i - start);
@@ -105,8 +125,10 @@ public class SafeContentHandler extends 
                 writeReplacement(output);
 
                 // Continue with the rest of the array
-                start = i + 1;
+                start = j;
             }
+
+            i = j;
         }
 
         // Output any remaining valid characters
@@ -114,25 +136,60 @@ public class SafeContentHandler extends 
     }
 
     /**
-     * Checks whether the given character (more accurately a UTF-16 code unit)
-     * is an invalid XML character and should be replaced for output.
-     * Subclasses can override this method to use an alternative definition
-     * of which characters should be replaced in the XML output.
+     * Checks if the given string contains any invalid XML characters.
+     *
+     * @param value string to be checked
+     * @return <code>true</code> if the string contains invalid XML characters,
+     *         <code>false</code> otherwise
+     */
+    private boolean isInvalid(String value) {
+        char[] ch = value.toCharArray();
+
+        int i = 0;
+        while (i < ch.length) {
+            int c = Character.codePointAt(ch, i);
+            if (isInvalid(c)) {
+                return true;
+            }
+            i = i + Character.charCount(c);
+        }
+
+        return false;
+    }
+
+    /**
+     * Checks whether the given Unicode character is an invalid XML character
+     * and should be replaced for output. Subclasses can override this method
+     * to use an alternative definition of which characters should be replaced
+     * in the XML output. The default definition from the XML specification is:
+     * <pre>
+     * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+     * </pre>
      *
      * @param ch character
      * @return <code>true</code> if the character should be replaced,
      *         <code>false</code> otherwise
      */
-    protected boolean isInvalid(char ch) {
-        // TODO: Correct handling of multi-word characters
+    protected boolean isInvalid(int ch) {
         if (ch < 0x20) {
             return ch != 0x09 && ch != 0x0A && ch != 0x0D;
+        } else if (ch < 0xE000) {
+            return ch > 0xD7FF;
+        } else if (ch < 0x10000) {
+            return ch > 0xFFFD;
         } else {
-            return ch >= 0xFFFE;
+            return ch > 0x10FFFF;
         }
     }
 
     /**
+     * @deprecated Use {@link #isInvalid(int)} instead
+     */
+    protected boolean isInvalid(char ch) {
+        return isInvalid((int) ch);
+    }
+
+    /**
      * Outputs the replacement for an invalid character. Subclasses can
      * override this method to use a custom replacement.
      *
@@ -146,6 +203,34 @@ public class SafeContentHandler extends 
     //------------------------------------------------------< ContentHandler >
 
     @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes atts)
+            throws SAXException {
+        // Look for any invalid characters in attribute values.
+        for (int i = 0; i < atts.getLength(); i++) {
+            if (isInvalid(atts.getValue(i))) {
+                // Found an invalid character, so need to filter the attributes
+                AttributesImpl filtered = new AttributesImpl();
+                for (int j = 0; j < atts.getLength(); j++) {
+                    String value = atts.getValue(j);
+                    if (j >= i && isInvalid(value)) {
+                        // Filter the attribute value when needed
+                        Output buffer = new StringOutput();
+                        filter(value.toCharArray(), 0, value.length(), buffer);
+                        value = buffer.toString();
+                    }
+                    filtered.addAttribute(
+                            atts.getURI(j), atts.getLocalName(j),
+                            atts.getQName(j), atts.getType(j), value);
+                }
+                atts = filtered;
+                break;
+            }
+        }
+        super.startElement(uri, localName, name, atts);
+    }
+
+    @Override
     public void characters(char[] ch, int start, int length)
             throws SAXException {
         filter(ch, start, length, charactersOutput);

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java?rev=1164655&r1=1164654&r2=1164655&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java Fri Sep  2 17:45:04 2011
@@ -65,4 +65,9 @@ public class SafeContentHandlerTest exte
         assertEquals("ab a c bc", output.toString());
     }
 
+    public void testInvalidSurrogates() throws SAXException {
+        safe.ignorableWhitespace("\udb00\ubfff".toCharArray(), 0, 2);
+        assertEquals(" \ubfff", output.toString());
+    }
+
 }