You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/29 22:16:35 UTC
svn commit: r1307091 - in /lucene/dev/branches/lucene3930/modules/benchmark:
ivy.xml lib/ lib/XERCESJ-1257_PLUS_JAVA7.patch.txt
lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar
src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
Author: rmuir
Date: Thu Mar 29 20:16:35 2012
New Revision: 1307091
URL: http://svn.apache.org/viewvc?rev=1307091&view=rev
Log:
LUCENE-3937: use a stock xerces jar file, but decode UTF-8 ourselves
Removed:
lucene/dev/branches/lucene3930/modules/benchmark/lib/XERCESJ-1257_PLUS_JAVA7.patch.txt
lucene/dev/branches/lucene3930/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar
Modified:
lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml
lucene/dev/branches/lucene3930/modules/benchmark/lib/ (props changed)
lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
Modified: lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml?rev=1307091&r1=1307090&r2=1307091&view=diff
==============================================================================
--- lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml (original)
+++ lucene/dev/branches/lucene3930/modules/benchmark/ivy.xml Thu Mar 29 20:16:35 2012
@@ -20,5 +20,6 @@
<info organisation="org.apache.lucene" module="benchmark"/>
<dependencies>
<dependency org="org.apache.commons" name="commons-compress" rev="1.2" transitive="false"/>
+ <dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=1307091&r1=1307090&r2=1307091&view=diff
==============================================================================
--- lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (original)
+++ lucene/dev/branches/lucene3930/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Thu Mar 29 20:16:35 2012
@@ -20,12 +20,17 @@ package org.apache.lucene.benchmark.byTa
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.IOUtils;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -172,7 +177,11 @@ public class EnwikiContentSource extends
while(true){
final InputStream localFileIS = is;
try {
- reader.parse(new InputSource(localFileIS));
+ // To work around a bug in Xerces (XERCESJ-1257), we assume the XML is always UTF-8 and decode it ourselves, handing the parser a Reader instead of the raw InputStream.
+ CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+ reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, decoder))));
} catch (IOException ioe) {
synchronized(EnwikiContentSource.this) {
if (localFileIS != is) {