You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/09/02 16:46:48 UTC
svn commit: r991956 [4/6] - in /tika/trunk: src/site/apt/
tika-core/src/main/java/org/apache/tika/
tika-core/src/main/java/org/apache/tika/detect/
tika-core/src/main/java/org/apache/tika/metadata/
tika-core/src/main/java/org/apache/tika/parser/ tika-co...
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java Thu Sep 2 14:46:46 2010
@@ -1,160 +1,160 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.sax;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Content handler decorator that makes sure that the character events
- * ({@link #characters(char[], int, int)} or
- * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
- * content handler contain only valid XML characters. All invalid characters
- * are replaced with spaces.
- * <p>
- * The XML standard defines the following Unicode character ranges as
- * valid XML characters:
- * <pre>
- * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
- * </pre>
- * <p>
- * Note that currently this class only detects those invalid characters whose
- * UTF-16 representation fits a single char. Also, this class does not ensure
- * that the UTF-16 encoding of incoming characters is correct.
- */
-public class SafeContentHandler extends ContentHandlerDecorator {
-
- /**
- * Replacement for invalid characters.
- */
- private static final char[] REPLACEMENT = new char[] { ' ' };
-
- /**
- * Internal interface that allows both character and
- * ignorable whitespace content to be filtered the same way.
- */
- protected interface Output {
- void write(char[] ch, int start, int length) throws SAXException;
- }
-
- /**
- * Output through the {@link ContentHandler#characters(char[], int, int)}
- * method of the decorated content handler.
- */
- private final Output charactersOutput = new Output() {
- public void write(char[] ch, int start, int length)
- throws SAXException {
- SafeContentHandler.super.characters(ch, start, length);
- }
- };
-
- /**
- * Output through the
- * {@link ContentHandler#ignorableWhitespace(char[], int, int)}
- * method of the decorated content handler.
- */
- private final Output ignorableWhitespaceOutput = new Output() {
- public void write(char[] ch, int start, int length)
- throws SAXException {
- SafeContentHandler.super.ignorableWhitespace(ch, start, length);
- }
- };
-
- public SafeContentHandler(ContentHandler handler) {
- super(handler);
- }
-
- /**
- * Filters and outputs the contents of the given input buffer. Any
- * invalid characters in the input buffer area handled by sending a
- * replacement (a space character) to the given output. Any sequences
- * of valid characters are passed as-is to the given output.
- *
- * @param ch input buffer
- * @param start start offset within the buffer
- * @param length number of characters to read from the buffer
- * @param output output channel
- * @throws SAXException if the filtered characters could not be written out
- */
- private void filter(char[] ch, int start, int length, Output output)
- throws SAXException {
- int end = start + length;
-
- for (int i = start; i < end; i++) {
- if (isInvalid(ch[i])) {
- // Output any preceding valid characters
- if (i > start) {
- output.write(ch, start, i - start);
- }
-
- // Output the replacement for this invalid character
- writeReplacement(output);
-
- // Continue with the rest of the array
- start = i + 1;
- }
- }
-
- // Output any remaining valid characters
- output.write(ch, start, end - start);
- }
-
- /**
- * Checks whether the given character (more accurately a UTF-16 code unit)
- * is an invalid XML character and should be replaced for output.
- * Subclasses can override this method to use an alternative definition
- * of which characters should be replaced in the XML output.
- *
- * @param ch character
- * @return <code>true</code> if the character should be replaced,
- * <code>false</code> otherwise
- */
- protected boolean isInvalid(char ch) {
- // TODO: Correct handling of multi-word characters
- if (ch < 0x20) {
- return ch != 0x09 && ch != 0x0A && ch != 0x0D;
- } else {
- return ch >= 0xFFFE;
- }
- }
-
- /**
- * Outputs the replacement for an invalid character. Subclasses can
- * override this method to use a custom replacement.
- *
- * @param output where the replacement is written to
- * @throws SAXException if the replacement could not be written
- */
- protected void writeReplacement(Output output) throws SAXException {
- output.write(REPLACEMENT, 0, REPLACEMENT.length);
- }
-
- //------------------------------------------------------< ContentHandler >
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- filter(ch, start, length, charactersOutput);
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length)
- throws SAXException {
- filter(ch, start, length, ignorableWhitespaceOutput);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Content handler decorator that makes sure that the character events
+ * ({@link #characters(char[], int, int)} or
+ * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
+ * content handler contain only valid XML characters. All invalid characters
+ * are replaced with spaces.
+ * <p>
+ * The XML standard defines the following Unicode character ranges as
+ * valid XML characters:
+ * <pre>
+ * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+ * </pre>
+ * <p>
+ * Note that currently this class only detects those invalid characters whose
+ * UTF-16 representation fits a single char. Also, this class does not ensure
+ * that the UTF-16 encoding of incoming characters is correct.
+ */
+public class SafeContentHandler extends ContentHandlerDecorator {
+
+ /**
+ * Replacement for invalid characters.
+ */
+ private static final char[] REPLACEMENT = new char[] { ' ' };
+
+ /**
+ * Internal interface that allows both character and
+ * ignorable whitespace content to be filtered the same way.
+ */
+ protected interface Output {
+ void write(char[] ch, int start, int length) throws SAXException;
+ }
+
+ /**
+ * Output through the {@link ContentHandler#characters(char[], int, int)}
+ * method of the decorated content handler.
+ */
+ private final Output charactersOutput = new Output() {
+ public void write(char[] ch, int start, int length)
+ throws SAXException {
+ SafeContentHandler.super.characters(ch, start, length);
+ }
+ };
+
+ /**
+ * Output through the
+ * {@link ContentHandler#ignorableWhitespace(char[], int, int)}
+ * method of the decorated content handler.
+ */
+ private final Output ignorableWhitespaceOutput = new Output() {
+ public void write(char[] ch, int start, int length)
+ throws SAXException {
+ SafeContentHandler.super.ignorableWhitespace(ch, start, length);
+ }
+ };
+
+ public SafeContentHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ /**
+ * Filters and outputs the contents of the given input buffer. Any
+ * invalid characters in the input buffer are handled by sending a
+ * replacement (a space character) to the given output. Any sequences
+ * of valid characters are passed as-is to the given output.
+ *
+ * @param ch input buffer
+ * @param start start offset within the buffer
+ * @param length number of characters to read from the buffer
+ * @param output output channel
+ * @throws SAXException if the filtered characters could not be written out
+ */
+ private void filter(char[] ch, int start, int length, Output output)
+ throws SAXException {
+ int end = start + length;
+
+ for (int i = start; i < end; i++) {
+ if (isInvalid(ch[i])) {
+ // Output any preceding valid characters
+ if (i > start) {
+ output.write(ch, start, i - start);
+ }
+
+ // Output the replacement for this invalid character
+ writeReplacement(output);
+
+ // Continue with the rest of the array
+ start = i + 1;
+ }
+ }
+
+ // Output any remaining valid characters
+ output.write(ch, start, end - start);
+ }
+
+ /**
+ * Checks whether the given character (more accurately a UTF-16 code unit)
+ * is an invalid XML character and should be replaced for output.
+ * Subclasses can override this method to use an alternative definition
+ * of which characters should be replaced in the XML output.
+ *
+ * @param ch character
+ * @return <code>true</code> if the character should be replaced,
+ * <code>false</code> otherwise
+ */
+ protected boolean isInvalid(char ch) {
+ // TODO: Correct handling of multi-word characters
+ if (ch < 0x20) {
+ return ch != 0x09 && ch != 0x0A && ch != 0x0D;
+ } else {
+ return ch >= 0xFFFE;
+ }
+ }
+
+ /**
+ * Outputs the replacement for an invalid character. Subclasses can
+ * override this method to use a custom replacement.
+ *
+ * @param output where the replacement is written to
+ * @throws SAXException if the replacement could not be written
+ */
+ protected void writeReplacement(Output output) throws SAXException {
+ output.write(REPLACEMENT, 0, REPLACEMENT.length);
+ }
+
+ //------------------------------------------------------< ContentHandler >
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ filter(ch, start, length, charactersOutput);
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ filter(ch, start, length, ignorableWhitespaceOutput);
+ }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java Thu Sep 2 14:46:46 2010
@@ -1,116 +1,116 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.sax;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * A content handler decorator that tags potential exceptions so that the
- * handler that caused the exception can easily be identified. This is
- * done by using the {@link TaggedSAXException} class to wrap all thrown
- * {@link SAXException}s. See below for an example of using this class.
- * <pre>
- * TaggedContentHandler handler = new TaggedContentHandler(...);
- * try {
- * // Processing that may throw an SAXException either from this handler
- * // or from some other XML parsing activity
- * processXML(handler);
- * } catch (SAXException e) {
- * if (handler.isCauseOf(e)) {
- * // The exception was caused by this handler.
- * // Use e.getCause() to get the original exception.
- * } else {
- * // The exception was caused by something else.
- * }
- * }
- * </pre>
- * <p>
- * Alternatively, the {@link #throwIfCauseOf(Exception)} method can be
- * used to let higher levels of code handle the exception caused by this
- * stream while other processing errors are being taken care of at this
- * lower level.
- * <pre>
- * TaggedContentHandler handler = new TaggedContentHandler(...);
- * try {
- * processXML(handler);
- * } catch (SAXException e) {
- * stream.throwIfCauseOf(e);
- * // ... or process the exception that was caused by something else
- * }
- * </pre>
- *
- * @see TaggedSAXException
- */
-public class TaggedContentHandler extends ContentHandlerDecorator {
-
- /**
- * Creates a tagging decorator for the given content handler.
- *
- * @param proxy content handler to be decorated
- */
- public TaggedContentHandler(ContentHandler proxy) {
- super(proxy);
- }
-
- /**
- * Tests if the given exception was caused by this handler.
- *
- * @param exception an exception
- * @return <code>true</code> if the exception was thrown by this handler,
- * <code>false</code> otherwise
- */
- public boolean isCauseOf(SAXException exception) {
- if (exception instanceof TaggedSAXException) {
- TaggedSAXException tagged = (TaggedSAXException) exception;
- return this == tagged.getTag();
- } else {
- return false;
- }
- }
-
- /**
- * Re-throws the original exception thrown by this handler. This method
- * first checks whether the given exception is a {@link TaggedSAXException}
- * wrapper created by this decorator, and then unwraps and throws the
- * original wrapped exception. Returns normally if the exception was
- * not thrown by this handler.
- *
- * @param exception an exception
- * @throws SAXException original exception, if any, thrown by this handler
- */
- public void throwIfCauseOf(Exception exception) throws SAXException {
- if (exception instanceof TaggedSAXException) {
- TaggedSAXException tagged = (TaggedSAXException) exception;
- if (this == tagged.getTag()) {
- throw tagged.getCause();
- }
- }
- }
-
- /**
- * Tags any {@link SAXException}s thrown, wrapping and re-throwing.
- *
- * @param e The SAXException thrown
- * @throws SAXException if an XML error occurs
- */
- @Override
- protected void handleException(SAXException e) throws SAXException {
- throw new TaggedSAXException(e, this);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A content handler decorator that tags potential exceptions so that the
+ * handler that caused the exception can easily be identified. This is
+ * done by using the {@link TaggedSAXException} class to wrap all thrown
+ * {@link SAXException}s. See below for an example of using this class.
+ * <pre>
+ * TaggedContentHandler handler = new TaggedContentHandler(...);
+ * try {
+ * // Processing that may throw a SAXException either from this handler
+ * // or from some other XML parsing activity
+ * processXML(handler);
+ * } catch (SAXException e) {
+ * if (handler.isCauseOf(e)) {
+ * // The exception was caused by this handler.
+ * // Use e.getCause() to get the original exception.
+ * } else {
+ * // The exception was caused by something else.
+ * }
+ * }
+ * </pre>
+ * <p>
+ * Alternatively, the {@link #throwIfCauseOf(Exception)} method can be
+ * used to let higher levels of code handle the exception caused by this
+ * handler while other processing errors are being taken care of at this
+ * lower level.
+ * <pre>
+ * TaggedContentHandler handler = new TaggedContentHandler(...);
+ * try {
+ * processXML(handler);
+ * } catch (SAXException e) {
+ * handler.throwIfCauseOf(e);
+ * // ... or process the exception that was caused by something else
+ * }
+ * </pre>
+ *
+ * @see TaggedSAXException
+ */
+public class TaggedContentHandler extends ContentHandlerDecorator {
+
+ /**
+ * Creates a tagging decorator for the given content handler.
+ *
+ * @param proxy content handler to be decorated
+ */
+ public TaggedContentHandler(ContentHandler proxy) {
+ super(proxy);
+ }
+
+ /**
+ * Tests if the given exception was caused by this handler.
+ *
+ * @param exception an exception
+ * @return <code>true</code> if the exception was thrown by this handler,
+ * <code>false</code> otherwise
+ */
+ public boolean isCauseOf(SAXException exception) {
+ if (exception instanceof TaggedSAXException) {
+ TaggedSAXException tagged = (TaggedSAXException) exception;
+ return this == tagged.getTag();
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Re-throws the original exception thrown by this handler. This method
+ * first checks whether the given exception is a {@link TaggedSAXException}
+ * wrapper created by this decorator, and then unwraps and throws the
+ * original wrapped exception. Returns normally if the exception was
+ * not thrown by this handler.
+ *
+ * @param exception an exception
+ * @throws SAXException original exception, if any, thrown by this handler
+ */
+ public void throwIfCauseOf(Exception exception) throws SAXException {
+ if (exception instanceof TaggedSAXException) {
+ TaggedSAXException tagged = (TaggedSAXException) exception;
+ if (this == tagged.getTag()) {
+ throw tagged.getCause();
+ }
+ }
+ }
+
+ /**
+ * Tags any {@link SAXException}s thrown, wrapping and re-throwing.
+ *
+ * @param e The SAXException thrown
+ * @throws SAXException if an XML error occurs
+ */
+ @Override
+ protected void handleException(SAXException e) throws SAXException {
+ throw new TaggedSAXException(e, this);
+ }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java?rev=991956&r1=991955&r2=991956&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java Thu Sep 2 14:46:46 2010
@@ -1,65 +1,65 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.sax;
-
-import org.xml.sax.SAXException;
-
-/**
- * A {@link SAXException} wrapper that tags the wrapped exception with
- * a given object reference. Both the tag and the wrapped original exception
- * can be used to determine further processing when this exception is caught.
- */
-public class TaggedSAXException extends SAXException {
-
- /**
- * The object reference used to tag the exception.
- */
- private final Object tag;
-
- /**
- * Creates a tagged wrapper for the given exception.
- *
- * @param original the exception to be tagged
- * @param tag tag object
- */
- public TaggedSAXException(SAXException original, Object tag) {
- super(original.getMessage(), original);
- initCause(original); // SAXException has it's own chaining mechanism!
- this.tag = tag;
- }
-
- /**
- * Returns the object reference used as the tag this exception.
- *
- * @return tag object
- */
- public Object getTag() {
- return tag;
- }
-
- /**
- * Returns the wrapped exception. The only difference to the overridden
- * {@link Throwable#getCause()} method is the narrower return type.
- *
- * @return wrapped exception
- */
- @Override
- public SAXException getCause() {
- return (SAXException) super.getCause();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.xml.sax.SAXException;
+
+/**
+ * A {@link SAXException} wrapper that tags the wrapped exception with
+ * a given object reference. Both the tag and the wrapped original exception
+ * can be used to determine further processing when this exception is caught.
+ */
+public class TaggedSAXException extends SAXException {
+
+ /**
+ * The object reference used to tag the exception.
+ */
+ private final Object tag;
+
+ /**
+ * Creates a tagged wrapper for the given exception.
+ *
+ * @param original the exception to be tagged
+ * @param tag tag object
+ */
+ public TaggedSAXException(SAXException original, Object tag) {
+ super(original.getMessage(), original);
+ initCause(original); // SAXException has its own chaining mechanism!
+ this.tag = tag;
+ }
+
+ /**
+ * Returns the object reference used as the tag of this exception.
+ *
+ * @return tag object
+ */
+ public Object getTag() {
+ return tag;
+ }
+
+ /**
+ * Returns the wrapped exception. The only difference to the overridden
+ * {@link Throwable#getCause()} method is the narrower return type.
+ *
+ * @return wrapped exception
+ */
+ @Override
+ public SAXException getCause() {
+ return (SAXException) super.getCause();
+ }
+
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
------------------------------------------------------------------------------
svn:eol-style = native