You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2009/06/23 18:46:17 UTC

svn commit: r787750 - in /lucene/java/trunk/contrib/benchmark: CHANGES.txt src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java

Author: mikemccand
Date: Tue Jun 23 16:46:17 2009
New Revision: 787750

URL: http://svn.apache.org/viewvc?rev=787750&view=rev
Log:
LUCENE-1714: fix WriteLineDocTask to also replace \r, \n (in addition to \t) with space so those chars don't create malformed lines

Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=787750&r1=787749&r2=787750&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Tue Jun 23 16:46:17 2009
@@ -3,6 +3,11 @@
 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
 
 $Id:$
+6/23/09
+  LUCENE-1714: WriteLineDocTask incorrectly normalized text, by replacing only 
+  occurrences of "\t" with a space. It now also replaces "\r" and "\n", 
+  so that LineDocMaker won't fail. (Shai Erera via Michael McCandless)
+  
 6/17/09 
   LUCENE-1595: This issue breaks previous external algorithms. DocMaker has been 
   replaced with a concrete class which accepts a ContentSource for iterating over 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?rev=787750&r1=787749&r2=787750&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java Tue Jun 23 16:46:17 2009
@@ -22,6 +22,8 @@
 import java.io.FileOutputStream;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
@@ -36,7 +38,6 @@
  * taske can be consumed by
  * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended
  * to save the IO overhead of opening a file per doument to be indexed.<br>
- * 
  * Supports the following parameters:
  * <ul>
  * <li>line.file.out - the name of the file to write the output to. That
@@ -45,10 +46,14 @@
  * recommended when the output file is expected to be large. (optional, default:
  * false).
  * </ul>
+ * <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the
+ * output is unspecified (as all will write to the same output file in a
+ * non-synchronized way).
  */
 public class WriteLineDocTask extends PerfTask {
 
   public final static char SEP = '\t';
+  private static final Matcher NORMALIZER = Pattern.compile("[\t\r\n]+").matcher("");
 
   private int docSize = 0;
   private BufferedWriter lineFileOut = null;
@@ -92,14 +97,14 @@
     Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument();
 
     Field f = doc.getField(DocMaker.BODY_FIELD);
-    String body = f != null ? f.stringValue().replace('\t', ' ') : null;
+    String body = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : null;
     
     if (body != null) {
       f = doc.getField(DocMaker.TITLE_FIELD);
-      String title = f != null ? f.stringValue().replace('\t', ' ') : "";
+      String title = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : "";
       
       f = doc.getField(DocMaker.DATE_FIELD);
-      String date = f != null ? f.stringValue().replace('\t', ' ') : "";
+      String date = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : "";
       
       lineFileOut.write(title, 0, title.length());
       lineFileOut.write(SEP);

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java?rev=787750&r1=787749&r2=787750&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java Tue Jun 23 16:46:17 2009
@@ -27,9 +27,7 @@
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.lucene.benchmark.BenchmarkTestCase;
 import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.benchmark.byTask.feeds.DocData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -42,10 +40,6 @@
   // class has to be public so that Class.forName.newInstance() will work
   public static final class WriteLineDocMaker extends DocMaker {
 
-    protected DocData getNextDocData() throws NoMoreDataException, Exception {
-      throw new UnsupportedOperationException("not implemented");
-    }
-
     public Document makeDocument() throws Exception {
       Document doc = new Document();
       doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
@@ -54,17 +48,28 @@
       return doc;
     }
     
-    public int numUniqueTexts() {
-      return 0;
+  }
+  
+  // class has to be public so that Class.forName.newInstance() will work
+  public static final class NewLinesDocMaker extends DocMaker {
+    
+    public Document makeDocument() throws Exception {
+      Document doc = new Document();
+      doc.add(new Field(BODY_FIELD, "body\r\ntext\ttwo", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      doc.add(new Field(TITLE_FIELD, "title\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      doc.add(new Field(DATE_FIELD, "date\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+      return doc;
     }
     
   }
   
   private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
 
-  private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal) throws Exception {
+  private PerfRunData createPerfRunData(File file, boolean setBZCompress,
+                                        String bz2CompressVal,
+                                        String docMakerName) throws Exception {
     Properties props = new Properties();
-    props.setProperty("doc.maker", WriteLineDocMaker.class.getName());
+    props.setProperty("doc.maker", docMakerName);
     props.setProperty("line.file.out", file.getAbsolutePath());
     if (setBZCompress) {
       props.setProperty("bzip.compression", bz2CompressVal);
@@ -74,7 +79,8 @@
     return new PerfRunData(config);
   }
   
-  private void doReadTest(File file, boolean bz2File) throws Exception {
+  private void doReadTest(File file, boolean bz2File, String expTitle,
+                          String expDate, String expBody) throws Exception {
     InputStream in = new FileInputStream(file);
     if (bz2File) {
       in = csFactory.createCompressorInputStream("bzip2", in);
@@ -85,9 +91,9 @@
       assertNotNull(line);
       String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
       assertEquals(3, parts.length);
-      assertEquals("title", parts[0]);
-      assertEquals("date", parts[1]);
-      assertEquals("body", parts[2]);
+      assertEquals(expTitle, parts[0]);
+      assertEquals(expDate, parts[1]);
+      assertEquals(expBody, parts[2]);
       assertNull(br.readLine());
     } finally {
       br.close();
@@ -99,36 +105,48 @@
     
     // Create a document in bz2 format.
     File file = new File(getWorkDir(), "one-line.bz2");
-    PerfRunData runData = createPerfRunData(file, true, "true");
+    PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
     
-    doReadTest(file, true);
+    doReadTest(file, true, "title", "date", "body");
   }
   
   public void testBZip2AutoDetect() throws Exception {
     
     // Create a document in bz2 format.
     File file = new File(getWorkDir(), "one-line.bz2");
-    PerfRunData runData = createPerfRunData(file, false, null);
+    PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
     
-    doReadTest(file, true);
+    doReadTest(file, true, "title", "date", "body");
   }
   
   public void testRegularFile() throws Exception {
     
     // Create a document in regular format.
     File file = new File(getWorkDir(), "one-line");
-    PerfRunData runData = createPerfRunData(file, true, "false");
+    PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName());
     WriteLineDocTask wldt = new WriteLineDocTask(runData);
     wldt.doLogic();
     wldt.close();
     
-    doReadTest(file, false);
+    doReadTest(file, false, "title", "date", "body");
+  }
+
+  public void testCharsReplace() throws Exception {
+    // WriteLineDocTask replaced only \t characters w/ a space, since that's its
+    // separator char. However, it didn't replace newline characters, which
+    // resulted in errors in LineDocMaker.
+    File file = new File(getWorkDir(), "one-line");
+    PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
+    WriteLineDocTask wldt = new WriteLineDocTask(runData);
+    wldt.doLogic();
+    wldt.close();
+    
+    doReadTest(file, false, "title text", "date text", "body text two");
   }
-  
 }