You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/29 22:16:35 UTC

svn commit: r1307091 - in /lucene/dev/branches/lucene3930/modules/benchmark: ivy.xml lib/ lib/XERCESJ-1257_PLUS_JAVA7.patch.txt lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java

Author: rmuir
Date: Thu Mar 29 20:16:35 2012
New Revision: 1307091

URL: http://svn.apache.org/viewvc?rev=1307091&view=rev
Log:
LUCENE-3937: use a stock xerces jar file, but decode UTF-8 ourselves

Removed:
    lucene/dev/branches/lucene3930/modules/benchmark/lib/XERCESJ-1257_PLUS_JAVA7.patch.txt
    lucene/dev/branches/lucene3930/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar
Modified:
    lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml
    lucene/dev/branches/lucene3930/modules/benchmark/lib/   (props changed)
    lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java

Modified: lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml?rev=1307091&r1=1307090&r2=1307091&view=diff
==============================================================================
--- lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml (original)
+++ lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml Thu Mar 29 20:16:35 2012
@@ -20,5 +20,6 @@
     <info organisation="org.apache.lucene" module="benchmark"/>
     <dependencies>
       <dependency org="org.apache.commons" name="commons-compress" rev="1.2" transitive="false"/>
+      <dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
     </dependencies>
 </ivy-module>

Modified: lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=1307091&r1=1307090&r2=1307091&view=diff
==============================================================================
--- lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (original)
+++ lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Thu Mar 29 20:16:35 2012
@@ -20,12 +20,17 @@ package org.apache.lucene.benchmark.byTa
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.util.HashMap;
 import java.util.Map;
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
 import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.IOUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -172,7 +177,11 @@ public class EnwikiContentSource extends
         while(true){
           final InputStream localFileIS = is;
           try {
-            reader.parse(new InputSource(localFileIS));
+            // To work around a bug in Xerces (XERCESJ-1257), we assume the XML is always UTF-8 and decode it ourselves, so we simply provide the parser with a Reader.
+            CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+                .onMalformedInput(CodingErrorAction.REPORT)
+                .onUnmappableCharacter(CodingErrorAction.REPORT);
+            reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, decoder))));
           } catch (IOException ioe) {
             synchronized(EnwikiContentSource.this) {
               if (localFileIS != is) {