You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/03/30 00:04:16 UTC

svn commit: r1307141 - in /lucene/dev/trunk: lucene/contrib/ modules/benchmark/lib/ modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/

Author: uschindler
Date: Thu Mar 29 22:04:15 2012
New Revision: 1307141

URL: http://svn.apache.org/viewvc?rev=1307141&view=rev
Log:
LUCENE-3937: Work around a XERCES-J bug in the benchmark module.

Added:
    lucene/dev/trunk/modules/benchmark/lib/xercesImpl-2.9.1.jar   (with props)
Removed:
    lucene/dev/trunk/modules/benchmark/lib/lucene-xercesImpl-pom.xml.template
    lucene/dev/trunk/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar
Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1307141&r1=1307140&r2=1307141&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Mar 29 22:04:15 2012
@@ -293,6 +293,9 @@ Bug Fixes
  * LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer
    could stop early if the Reader only partially fills the provided
    buffer. (Mike McCandless) 
+   
+ * LUCENE-3937: Work around a XERCES-J bug in the benchmark module.
+   (Uwe Schindler, Robert Muir, Mike McCandless)
   
 Documentation
 

Added: lucene/dev/trunk/modules/benchmark/lib/xercesImpl-2.9.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/lib/xercesImpl-2.9.1.jar?rev=1307141&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=1307141&r1=1307140&r2=1307141&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (original)
+++ lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Thu Mar 29 22:04:15 2012
@@ -20,12 +20,17 @@ package org.apache.lucene.benchmark.byTa
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.util.HashMap;
 import java.util.Map;
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
 import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.IOUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -172,7 +177,11 @@ public class EnwikiContentSource extends
         while(true){
           final InputStream localFileIS = is;
           try {
-            reader.parse(new InputSource(localFileIS));
+            // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF-8 and hand the parser a Reader (with a strict UTF-8 decoder) instead of the raw InputStream.
+            CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+                .onMalformedInput(CodingErrorAction.REPORT)
+                .onUnmappableCharacter(CodingErrorAction.REPORT);
+            reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, decoder))));
           } catch (IOException ioe) {
             synchronized(EnwikiContentSource.this) {
               if (localFileIS != is) {