You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by ma...@apache.org on 2009/07/06 17:56:40 UTC
svn commit: r791528 - in /lucene/java/trunk/contrib/benchmark: ./
src/java/org/apache/lucene/benchmark/byTask/feeds/
src/test/org/apache/lucene/benchmark/byTask/feeds/
Author: markrmiller
Date: Mon Jul 6 15:56:39 2009
New Revision: 791528
URL: http://svn.apache.org/viewvc?rev=791528&view=rev
Log:
LUCENE-1730: Fix TrecContentSource to use ISO-8859-1 when reading the TREC files, unless a different encoding is specified. Additionally, ContentSource now supports a content.source.encoding parameter in the configuration file.
Modified:
lucene/java/trunk/contrib/benchmark/CHANGES.txt
lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=791528&r1=791527&r2=791528&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Mon Jul 6 15:56:39 2009
@@ -4,6 +4,12 @@
$Id:$
+7/6/2009
+ LUCENE-1730: Fix TrecContentSource to use ISO-8859-1 when reading the TREC files,
+ unless a different encoding is specified. Additionally, ContentSource now supports
+ a content.source.encoding parameter in the configuration file.
+ (Shai Erera via Mark Miller)
+
6/26/2009
LUCENE-1716: Added the following support:
doc.tokenized.norms: specifies whether to store norms
Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java?rev=791528&r1=791527&r2=791528&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java Mon Jul 6 15:56:39 2009
@@ -34,9 +34,9 @@
/**
* Represents content from a specified source, such as TREC, Reuters etc. A
* {@link ContentSource} is responsible for creating {@link DocData} objects for
- * its documents to be consumed by {@link ToDeleteDocMaker}. It also keeps track of
- * various statistics, such as how many documents were generated, size in bytes
- * etc.
+ * its documents to be consumed by {@link ToDeleteDocMaker}. It also keeps track
+ * of various statistics, such as how many documents were generated, size in
+ * bytes etc.
* <p>
* Supports the following configuration parameters:
* <ul>
@@ -44,6 +44,9 @@
* forever (<b>default=true</b>).
* <li><b>content.source.verbose</b> - specifies whether messages should be
* output by the content source (<b>default=false</b>).
+ * <li><b>content.source.encoding</b> - specifies which encoding to use when
+ * reading the files of that content source. Certain implementations may define
+ * a default value if this parameter is not specified (<b>default=null</b>).
* <li><b>content.source.log.step</b> - specifies for how many documents a
* message should be logged. If set to 0 it means no logging should occur.
* <b>NOTE:</b> if verbose is set to false, logging should not occur even if
@@ -71,6 +74,7 @@
protected boolean forever;
protected int logStep;
protected boolean verbose;
+ protected String encoding;
private CompressorStreamFactory csFactory = new CompressorStreamFactory();
@@ -196,6 +200,7 @@
forever = config.get("content.source.forever", true);
logStep = config.get("content.source.log.step", 0);
verbose = config.get("content.source.verbose", false);
+ encoding = config.get("content.source.encoding", null);
}
}
Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java?rev=791528&r1=791527&r2=791528&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java Mon Jul 6 15:56:39 2009
@@ -39,6 +39,7 @@
* Config properties:
* <ul>
* <li>docs.file=<path to the file>
+ * <li>content.source.encoding - defaults to UTF-8.
* </ul>
*/
public class LineDocSource extends ContentSource {
@@ -54,7 +55,7 @@
reader.close();
}
InputStream is = getInputStream(file);
- reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);
+ reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -111,6 +112,9 @@
throw new IllegalArgumentException("docs.file must be set");
}
file = new File(fileName).getAbsoluteFile();
+ if (encoding == null) {
+ encoding = "UTF-8";
+ }
}
}
Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=791528&r1=791527&r2=791528&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Mon Jul 6 15:56:39 2009
@@ -47,10 +47,11 @@
* (<b>default=trec</b>).
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
* parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
+ * <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
* </ul>
*/
public class TrecContentSource extends ContentSource {
- // TODO (3.0): change StringBuffer to StringBuffer
+ // TODO (3.0): change StringBuffer to StringBuilder
private static final class DateFormatInfo {
DateFormat[] dfs;
@@ -181,8 +182,8 @@
System.out.println("opening: " + f + " length: " + f.length());
}
try {
- GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), 1 << 16);
- reader = new BufferedReader(new InputStreamReader(zis), 1 << 16);
+ GZIPInputStream zis = new GZIPInputStream(new FileInputStream(f), BUFFER_SIZE);
+ reader = new BufferedReader(new InputStreamReader(zis, encoding), BUFFER_SIZE);
return;
} catch (Exception e) {
retries++;
@@ -334,6 +335,9 @@
// Should not get here. Throw runtime exception.
throw new RuntimeException(e);
}
+ if (encoding == null) {
+ encoding = "ISO-8859-1";
+ }
}
}
Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java?rev=791528&r1=791527&r2=791528&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.java Mon Jul 6 15:56:39 2009
@@ -40,7 +40,7 @@
this.forever = forever;
}
- protected void openNextFile() throws NoMoreDataException, IOException {
+ void openNextFile() throws NoMoreDataException, IOException {
if (reader != null) {
if (!forever) {
throw new NoMoreDataException();