You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/03/30 00:04:16 UTC
svn commit: r1307141 - in /lucene/dev/trunk: lucene/contrib/
modules/benchmark/lib/
modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/
Author: uschindler
Date: Thu Mar 29 22:04:15 2012
New Revision: 1307141
URL: http://svn.apache.org/viewvc?rev=1307141&view=rev
Log:
LUCENE-3937: Work around a XERCES-J bug in the benchmark module.
Added:
lucene/dev/trunk/modules/benchmark/lib/xercesImpl-2.9.1.jar (with props)
Removed:
lucene/dev/trunk/modules/benchmark/lib/lucene-xercesImpl-pom.xml.template
lucene/dev/trunk/modules/benchmark/lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1307141&r1=1307140&r2=1307141&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Thu Mar 29 22:04:15 2012
@@ -293,6 +293,9 @@ Bug Fixes
* LUCENE-3894: ICUTokenizer, NGramTokenizer and EdgeNGramTokenizer
could stop early if the Reader only partially fills the provided
buffer. (Mike McCandless)
+
+ * LUCENE-3937: Work around a XERCES-J bug in the benchmark module.
+ (Uwe Schindler, Robert Muir, Mike McCandless)
Documentation
Added: lucene/dev/trunk/modules/benchmark/lib/xercesImpl-2.9.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/lib/xercesImpl-2.9.1.jar?rev=1307141&view=auto
==============================================================================
Binary file - no diff available.
Modified: lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=1307141&r1=1307140&r2=1307141&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (original)
+++ lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Thu Mar 29 22:04:15 2012
@@ -20,12 +20,17 @@ package org.apache.lucene.benchmark.byTa
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.ThreadInterruptedException;
+import org.apache.lucene.util.IOUtils;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -172,7 +177,11 @@ public class EnwikiContentSource extends
while(true){
final InputStream localFileIS = is;
try {
- reader.parse(new InputSource(localFileIS));
+ // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF-8 and hand the parser a Reader with a strict UTF-8 decoder instead of the raw InputStream.
+ CharsetDecoder decoder = IOUtils.CHARSET_UTF_8.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPORT)
+ .onUnmappableCharacter(CodingErrorAction.REPORT);
+ reader.parse(new InputSource(new BufferedReader(new InputStreamReader(localFileIS, decoder))));
} catch (IOException ioe) {
synchronized(EnwikiContentSource.this) {
if (localFileIS != is) {