You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/06/20 21:11:00 UTC
svn commit: r415772 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/searcher/OpenSearchServlet.java
test/org/apache/nutch/searcher/TestOpenSearchServlet.java
Author: siren
Date: Tue Jun 20 12:11:00 2006
New Revision: 415772
URL: http://svn.apache.org/viewvc?rev=415772&view=rev
Log:
NUTCH-110 fix illegal xml output contributed by stack@archive.org
Added:
lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=415772&r1=415771&r2=415772&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Tue Jun 20 12:11:00 2006
@@ -262,23 +262,57 @@
private static void addNode(Document doc, Node parent,
String name, String text) {
Element child = doc.createElement(name);
- child.appendChild(doc.createTextNode(text));
+ child.appendChild(doc.createTextNode(getLegalXml(text)));
parent.appendChild(child);
}
private static void addNode(Document doc, Node parent,
String ns, String name, String text) {
Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
- child.appendChild(doc.createTextNode(text));
+ child.appendChild(doc.createTextNode(getLegalXml(text)));
parent.appendChild(child);
}
private static void addAttribute(Document doc, Element node,
String name, String value) {
Attr attribute = doc.createAttribute(name);
- attribute.setValue(value);
+ attribute.setValue(getLegalXml(value));
node.getAttributes().setNamedItem(attribute);
}
-}
+  /**
+   * Ensure string is legal xml.
+   * A high surrogate directly followed by a low surrogate is kept, because
+   * the pair encodes one character in the legal range #x10000-#x10FFFF; a
+   * surrogate without its partner is removed.
+   * @param text String to verify. May be <code>null</code>.
+   * @return Passed <code>text</code> or a new string with illegal
+   * characters removed if any found in <code>text</code>;
+   * <code>null</code> if <code>text</code> is <code>null</code>.
+   * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">XML
+   * 1.0 Char production</a>
+   */
+  protected static String getLegalXml(final String text) {
+    if (text == null) {
+      return null;
+    }
+    StringBuffer buffer = null;
+    final int length = text.length();
+    for (int i = 0; i < length; i++) {
+      final char c = text.charAt(i);
+      // A well-formed surrogate pair is one legal character spread over two
+      // code units, so both units are kept or dropped together.
+      final boolean pair = isHighSurrogate(c) && i + 1 < length
+          && isLowSurrogate(text.charAt(i + 1));
+      if (pair || isLegalXml(c)) {
+        if (buffer != null) {
+          buffer.append(c);
+          if (pair) {
+            buffer.append(text.charAt(i + 1));
+          }
+        }
+      } else if (buffer == null) {
+        // Start up a buffer on the first bad character, seeded with the
+        // clean prefix. Legal characters are copied here from now on.
+        buffer = new StringBuffer(length);
+        buffer.append(text.substring(0, i));
+      }
+      if (pair) {
+        // The low surrogate was handled together with its partner.
+        i++;
+      }
+    }
+    return (buffer != null) ? buffer.toString() : text;
+  }
+
+  /*
+   * Test one UTF-16 code unit against the part of the Char production that
+   * fits in a char. The range #x10000-#x10FFFF cannot be tested here since
+   * a char never exceeds 0xffff; getLegalXml covers it via surrogate pairs.
+   */
+  private static boolean isLegalXml(final char c) {
+    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
+        || (c >= 0xe000 && c <= 0xfffd);
+  }
+
+  /*
+   * Surrogate range checks are spelled out by hand because
+   * Character.isHighSurrogate/isLowSurrogate need Java 5.
+   */
+  private static boolean isHighSurrogate(final char c) {
+    return c >= 0xd800 && c <= 0xdbff;
+  }
+
+  private static boolean isLowSurrogate(final char c) {
+    return c >= 0xdc00 && c <= 0xdfff;
+  }
+}
Added: lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java?rev=415772&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestOpenSearchServlet.java Tue Jun 20 12:11:00 2006
@@ -0,0 +1,39 @@
+package org.apache.nutch.searcher;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for {@link OpenSearchServlet}.
+ */
+public class TestOpenSearchServlet extends TestCase {
+
+  /**
+   * Test removing of illegal xml chars from string
+   */
+  public void testGetLegalXml(){
+    assertEquals("hello",OpenSearchServlet.getLegalXml("hello"));
+    assertEquals("hello",OpenSearchServlet.getLegalXml("he\u0000llo"));
+    assertEquals("hello",OpenSearchServlet.getLegalXml("\u0000he\u0000llo"));
+    assertEquals("hello",OpenSearchServlet.getLegalXml("\u0000he\u0000llo\u0000"));
+  }
+
+  /**
+   * Test inputs that must come back unchanged: null, the empty string and
+   * the legal control characters tab, carriage return and line feed.
+   */
+  public void testGetLegalXmlKeepsLegalInput(){
+    assertNull(OpenSearchServlet.getLegalXml(null));
+    assertEquals("",OpenSearchServlet.getLegalXml(""));
+    assertEquals("a\tb\r\nc",OpenSearchServlet.getLegalXml("a\tb\r\nc"));
+  }
+
+  /**
+   * Test removing of illegal chars other than NUL
+   */
+  public void testGetLegalXmlOtherIllegalChars(){
+    assertEquals("hello",OpenSearchServlet.getLegalXml("he\u001fllo"));
+    assertEquals("hello",OpenSearchServlet.getLegalXml("he\ufffe\uffffllo"));
+    assertEquals("hello",OpenSearchServlet.getLegalXml("he\ud800llo"));
+  }
+
+}