You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2009/11/13 01:47:16 UTC

svn commit: r835677 - in /lucene/java/trunk/contrib/benchmark: CHANGES.txt src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java

Author: rmuir
Date: Fri Nov 13 00:47:15 2009
New Revision: 835677

URL: http://svn.apache.org/viewvc?rev=835677&view=rev
Log:
LUCENE-2059: allow TrecContentSource not to change the docname

Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=835677&r1=835676&r2=835677&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Fri Nov 13 00:47:15 2009
@@ -5,6 +5,13 @@
 $Id:$
 
 11/12/2009
+  LUCENE-2059: allow TrecContentSource not to change the docname.
+  Previously, it would always append the iteration number to the docname.
+  Setting the new option content.source.excludeIteration to true disables this.
+  The resulting index can then be used with the quality package to measure
+  relevance. (Robert Muir)
+  
+11/12/2009
   LUCENE-2058: specify trec_eval submission output from the command line.
   Previously, 4 arguments were required, but the third was unused. The 
   third argument is now the desired location of submission.txt  (Robert Muir)

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=835677&r1=835676&r2=835677&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Fri Nov 13 00:47:15 2009
@@ -48,6 +48,7 @@
  * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
  * parsing the TREC documents content (<b>default=DemoHTMLParser</b>).
  * <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
+ * <li><b>content.source.excludeIteration</b> - if true, do not append the iteration number to the docname (<b>default=false</b>).
  * </ul>
  */
 public class TrecContentSource extends ContentSource {
@@ -91,6 +92,7 @@
   BufferedReader reader;
   int iteration = 0;
   HTMLParser htmlParser;
+  private boolean excludeDocnameIteration;
   
   private DateFormatInfo getDateFormatInfo() {
     DateFormatInfo dfi = dateFormats.get();
@@ -256,7 +258,8 @@
       read(docBuf, DOCNO, true, false, null);
       name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
           DOCNO.length()));
-      name = name + "_" + iteration;
+      if (!excludeDocnameIteration)
+        name = name + "_" + iteration;
 
       // 3. skip until doc header
       docBuf.setLength(0);
@@ -342,6 +345,7 @@
     if (encoding == null) {
       encoding = "ISO-8859-1";
     }
+    excludeDocnameIteration = config.get("content.source.excludeIteration", false);
   }
 
 }