You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/01/13 22:51:55 UTC
svn commit: r734249 -
/lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java
Author: jukka
Date: Tue Jan 13 13:51:52 2009
New Revision: 734249
URL: http://svn.apache.org/viewvc?rev=734249&view=rev
Log:
TIKA-180: XHTMLContentHandler unable to extract text from MSWord file
Improved javadocs on SafeContentHandler.
Detect U+FFFE and U+FFFF as invalid XML characters.
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java?rev=734249&r1=734248&r2=734249&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/SafeContentHandler.java Tue Jan 13 13:51:52 2009
@@ -19,8 +19,28 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+/**
+ * Content handler decorator that makes sure that the character events
+ * ({@link #characters(char[], int, int)} or
+ * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
+ * content handler contain only valid XML characters. All invalid characters
+ * are replaced with spaces.
+ * <p>
+ * The XML standard defines the following Unicode character ranges as
+ * valid XML characters:
+ * <pre>
+ * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+ * </pre>
+ * <p>
+ * Note that currently this class only detects those invalid characters whose
+ * UTF-16 representation fits a single char. Also, this class does not ensure
+ * that the UTF-16 encoding of incoming characters is correct.
+ */
public class SafeContentHandler extends ContentHandlerDecorator {
+ /**
+ * Replacement for invalid characters.
+ */
private static final char[] REPLACEMENT = new char[] { ' ' };
/**
@@ -58,6 +78,18 @@
super(handler);
}
+ /**
+ * Filters and outputs the contents of the given input buffer. Any
+ * invalid characters in the input buffer are handled by sending a
+ * replacement (a space character) to the given output. Any sequences
+ * of valid characters are passed as-is to the given output.
+ *
+ * @param ch input buffer
+ * @param start start offset within the buffer
+ * @param length number of characters to read from the buffer
+ * @param output output channel
+ * @throws SAXException if the filtered characters could not be written out
+ */
private void filter(char[] ch, int start, int length, Output output)
throws SAXException {
int end = start + length;
@@ -92,8 +124,12 @@
* <code>false</code> otherwise
*/
protected boolean isInvalid(char ch) {
- // TODO: Detect also FFFE, FFFF, and the surrogate blocks
- return ch < 0x20 && ch != 0x09 && ch != 0x0A && ch != 0x0D;
+ // TODO: Correct handling of multi-word characters
+ if (ch < 0x20) {
+ return ch != 0x09 && ch != 0x0A && ch != 0x0D;
+ } else {
+ return ch >= 0xFFFE;
+ }
}
/**