You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2009/11/13 01:47:16 UTC
svn commit: r835677 - in /lucene/java/trunk/contrib/benchmark: CHANGES.txt
src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
Author: rmuir
Date: Fri Nov 13 00:47:15 2009
New Revision: 835677
URL: http://svn.apache.org/viewvc?rev=835677&view=rev
Log:
LUCENE-2059: allow TrecContentSource not to change the docname
Modified:
lucene/java/trunk/contrib/benchmark/CHANGES.txt
lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=835677&r1=835676&r2=835677&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Fri Nov 13 00:47:15 2009
@@ -5,6 +5,13 @@
$Id:$
11/12/2009
+ LUCENE-2059: allow TrecContentSource not to change the docname.
+ Previously, it would always append the iteration # to the docname.
+ Setting the new option content.source.excludeIteration to true disables this.
+ The resulting index can then be used with the quality package to measure
+ relevance. (Robert Muir)
+
+11/12/2009
LUCENE-2058: specify trec_eval submission output from the command line.
Previously, 4 arguments were required, but the third was unused. The
third argument is now the desired location of submission.txt (Robert Muir)
Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=835677&r1=835676&r2=835677&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Fri Nov 13 00:47:15 2009
@@ -48,6 +48,7 @@
* <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
* parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
* <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
+ * <li><b>content.source.excludeIteration</b> - if true, do not append iteration number to docname (<b>default=false</b>).
* </ul>
*/
public class TrecContentSource extends ContentSource {
@@ -91,6 +92,7 @@
BufferedReader reader;
int iteration = 0;
HTMLParser htmlParser;
+ private boolean excludeDocnameIteration;
private DateFormatInfo getDateFormatInfo() {
DateFormatInfo dfi = dateFormats.get();
@@ -256,7 +258,8 @@
read(docBuf, DOCNO, true, false, null);
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
DOCNO.length()));
- name = name + "_" + iteration;
+ if (!excludeDocnameIteration)
+ name = name + "_" + iteration;
// 3. skip until doc header
docBuf.setLength(0);
@@ -342,6 +345,7 @@
if (encoding == null) {
encoding = "ISO-8859-1";
}
+ excludeDocnameIteration = config.get("content.source.excludeIteration", false);
}
}