You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by do...@apache.org on 2011/03/21 15:59:43 UTC
svn commit: r1083816 - in /lucene/dev/trunk: ./ lucene/ modules/benchmark/
modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/
modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/
modules/benchmark/src/test/org/apache/l...
Author: doronc
Date: Mon Mar 21 14:59:42 2011
New Revision: 1083816
URL: http://svn.apache.org/viewvc?rev=1083816&view=rev
Log:
LUCENE-2958: WriteLineDocTask improvements - flexible line fields definition - port/merge from 3x.
Modified:
lucene/dev/trunk/ (props changed)
lucene/dev/trunk/lucene/ (props changed)
lucene/dev/trunk/modules/benchmark/CHANGES.txt
lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java
lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
Modified: lucene/dev/trunk/modules/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/CHANGES.txt?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/CHANGES.txt (original)
+++ lucene/dev/trunk/modules/benchmark/CHANGES.txt Mon Mar 21 14:59:42 2011
@@ -3,6 +3,13 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
03/21/2011
+ LUCENE-2958: WriteLineDocTask improvements - allow emitting line docs even for empty
+ docs, and be flexible about which fields are added to the line file. For this, a header
+ line was added to the line file. That header is examined by LineDocSource. Old line
+ files which have no header line are handled as before, imposing the default header.
+ (Doron Cohen, Shai Erera, Mike McCandless)
+
+03/21/2011
LUCENE-2964: Allow benchmark tasks from alternative packages,
specified through a new property "alt.tasks.packages".
(Doron Cohen, Shai Erera)
Modified: lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java (original)
+++ lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java Mon Mar 21 14:59:42 2011
@@ -22,6 +22,9 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.lang.reflect.Constructor;
+import java.util.Arrays;
+import java.util.Properties;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
@@ -40,16 +43,136 @@ import org.apache.lucene.benchmark.byTas
* <ul>
* <li>docs.file=<path to the file>
* <li>content.source.encoding - default to UTF-8.
+ * <li>line.parser - default to {@link HeaderLineParser} if a header line exists which differs
+ * from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise.
* </ul>
*/
public class LineDocSource extends ContentSource {
- private final static char SEP = WriteLineDocTask.SEP;
+ /** Reader of a single input line into {@link DocData}. */
+ public static abstract class LineParser {
+ protected final String[] header;
+ /** Construct with the header
+ * @param header header line found in the input file, or null if none
+ */
+ public LineParser(String[] header) {
+ this.header = header;
+ }
+ /** parse an input line and fill doc data appropriately */
+ public abstract void parseLine(DocData docData, String line);
+ }
+
+ /**
+ * {@link LineParser} which ignores the header passed to its constructor
+ * and assumes simply that field names and their order are the same
+ * as in {@link WriteLineDocTask#DEFAULT_FIELDS}
+ */
+ public static class SimpleLineParser extends LineParser {
+ public SimpleLineParser(String[] header) {
+ super(header);
+ }
+ public void parseLine(DocData docData, String line) {
+ int k1 = 0;
+ int k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+ if (k2<0) {
+ throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
+ }
+ docData.setTitle(line.substring(k1,k2));
+ k1 = k2+1;
+ k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+ if (k2<0) {
+ throw new RuntimeException("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
+ }
+ docData.setDate(line.substring(k1,k2));
+ k1 = k2+1;
+ k2 = line.indexOf(WriteLineDocTask.SEP, k1);
+ if (k2>=0) {
+ throw new RuntimeException("line: [" + line + "] is in an invalid format (too many separators)!");
+ }
+ // last one
+ docData.setBody(line.substring(k1));
+ }
+ }
+
+ /**
+ * {@link LineParser} which sets field names and order by
+ * the header - any header - of the lines file.
+ * It is less efficient than {@link SimpleLineParser} but more powerful.
+ */
+ public static class HeaderLineParser extends LineParser {
+ private enum FieldName { NAME , TITLE , DATE , BODY, PROP }
+ private final FieldName[] posToF;
+ public HeaderLineParser(String[] header) {
+ super(header);
+ posToF = new FieldName[header.length];
+ for (int i=0; i<header.length; i++) {
+ String f = header[i];
+ if (DocMaker.NAME_FIELD.equals(f)) {
+ posToF[i] = FieldName.NAME;
+ } else if (DocMaker.TITLE_FIELD.equals(f)) {
+ posToF[i] = FieldName.TITLE;
+ } else if (DocMaker.DATE_FIELD.equals(f)) {
+ posToF[i] = FieldName.DATE;
+ } else if (DocMaker.BODY_FIELD.equals(f)) {
+ posToF[i] = FieldName.BODY;
+ } else {
+ posToF[i] = FieldName.PROP;
+ }
+ }
+ }
+
+ public void parseLine(DocData docData, String line) {
+ int n = 0;
+ int k1 = 0;
+ int k2;
+ while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) {
+ if (n>=header.length) {
+ throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
+ }
+ setDocDataField(docData, n, line.substring(k1,k2));
+ ++n;
+ k1 = k2 + 1;
+ }
+ if (n!=header.length-1) {
+ throw new RuntimeException("input line has invalid format: "+(n+1)+" fields instead of "+header.length+" :: [" + line + "]");
+ }
+ // last one
+ setDocDataField(docData, n, line.substring(k1));
+ }
+ private void setDocDataField(DocData docData, int position, String text) {
+ switch(posToF[position]) {
+ case NAME:
+ docData.setName(text);
+ break;
+ case TITLE:
+ docData.setTitle(text);
+ break;
+ case DATE:
+ docData.setDate(text);
+ break;
+ case BODY:
+ docData.setBody(text);
+ break;
+ case PROP:
+ Properties p = docData.getProps();
+ if (p==null) {
+ p = new Properties();
+ docData.setProps(p);
+ }
+ p.setProperty(header[position], text);
+ break;
+ }
+ }
+ }
+
private File file;
private BufferedReader reader;
private int readCount;
+ private LineParser docDataLineReader = null;
+ private boolean skipHeaderLine = false;
+
private synchronized void openFile() {
try {
if (reader != null) {
@@ -57,6 +180,9 @@ public class LineDocSource extends Conte
}
InputStream is = getInputStream(file);
reader = new BufferedReader(new InputStreamReader(is, encoding), BUFFER_SIZE);
+ if (skipHeaderLine) {
+ reader.readLine(); // skip one line - the header line - already handled that info
+ }
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -77,7 +203,6 @@ public class LineDocSource extends Conte
synchronized(this) {
line = reader.readLine();
- myID = readCount++;
if (line == null) {
if (!forever) {
throw new NoMoreDataException();
@@ -86,27 +211,54 @@ public class LineDocSource extends Conte
openFile();
return getNextDocData(docData);
}
+ if (docDataLineReader == null) { // first line ever, one time initialization,
+ docDataLineReader = createDocDataLineReader(line);
+ if (skipHeaderLine) {
+ return getNextDocData(docData);
+ }
+ }
+ // assign a doc ID only to real doc lines, so that the header line does not consume one
+ myID = readCount++;
}
- // A line must be in the following format. If it's not, fail !
- // title <TAB> date <TAB> body <NEWLINE>
- int spot = line.indexOf(SEP);
- if (spot == -1) {
- throw new RuntimeException("line: [" + line + "] is in an invalid format !");
- }
- int spot2 = line.indexOf(SEP, 1 + spot);
- if (spot2 == -1) {
- throw new RuntimeException("line: [" + line + "] is in an invalid format !");
- }
// The date String was written in the format of DateTools.dateToString.
docData.clear();
docData.setID(myID);
- docData.setBody(line.substring(1 + spot2, line.length()));
- docData.setTitle(line.substring(0, spot));
- docData.setDate(line.substring(1 + spot, spot2));
+ docDataLineReader.parseLine(docData, line);
return docData;
}
+ private LineParser createDocDataLineReader(String line) {
+ String[] header;
+ String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP;
+
+ if (line.startsWith(headIndicator)) {
+ header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP));
+ skipHeaderLine = true; // mark to skip the header line when input file is reopened
+ } else {
+ header = WriteLineDocTask.DEFAULT_FIELDS;
+ }
+
+ // if a specific LineParser was configured through "line.parser", must respect it
+ String docDataLineReaderClassName = getConfig().get("line.parser", null);
+ if (docDataLineReaderClassName!=null) {
+ try {
+ final Class<? extends LineParser> clazz =
+ Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class);
+ Constructor<? extends LineParser> cnstr = clazz.getConstructor(new Class[]{String[].class});
+ return cnstr.newInstance((Object)header);
+ } catch (Exception e) {
+ throw new RuntimeException("Failed to instantiate "+docDataLineReaderClassName, e);
+ }
+ }
+
+ // if this is the simple case (default fields), use the simple parser
+ if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) {
+ return new SimpleLineParser(header);
+ }
+ return new HeaderLineParser(header);
+ }
+
@Override
public void resetInputs() throws IOException {
super.resetInputs();
Modified: lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (original)
+++ lucene/dev/trunk/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java Mon Mar 21 14:59:42 2011
@@ -23,6 +23,8 @@ import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
+import java.util.Arrays;
+import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -41,11 +43,17 @@ import org.apache.lucene.document.Field;
* to save the IO overhead of opening a file per document to be indexed.<br>
* Supports the following parameters:
* <ul>
- * <li>line.file.out - the name of the file to write the output to. That
+ * <li><b>line.file.out</b> - the name of the file to write the output to. That
* parameter is mandatory. <b>NOTE:</b> the file is re-created.
- * <li>bzip.compression - whether the output should be bzip-compressed. This is
- * recommended when the output file is expected to be large. (optional, default:
- * false).
+ * <li><b>bzip.compression</b> - whether the output should be bzip-compressed. This is
+ * recommended when the output file is expected to be large.
+ * <li><b>line.fields</b> - which fields should be written in each line.
+ * (optional, default: {@link #DEFAULT_FIELDS}).
+ * <li><b>sufficient.fields</b> - comma-separated list of field names; a document is
+ * skipped only if all of them are missing or empty. For example, to require
+ * that at least one of f1,f2 is not empty, specify: "f1,f2" in this field. To specify
+ * that no field is required, i.e. that even empty docs should be emitted, specify <b>","</b>.
+ * (optional, default: {@link #DEFAULT_SUFFICIENT_FIELDS}).
* </ul>
* <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the
* output is unspecified (as all will write to the same output file in a
@@ -53,13 +61,32 @@ import org.apache.lucene.document.Field;
*/
public class WriteLineDocTask extends PerfTask {
+ public static final String FIELDS_HEADER_INDICATOR = "FIELDS_HEADER_INDICATOR###";
+
public final static char SEP = '\t';
+ /**
+ * Fields to be written by default
+ */
+ public static final String[] DEFAULT_FIELDS = new String[] {
+ DocMaker.TITLE_FIELD,
+ DocMaker.DATE_FIELD,
+ DocMaker.BODY_FIELD,
+ };
+
+ /**
+ * Default sufficient fields: a doc is written only if at least one of them is non-empty.
+ */
+ public static final String DEFAULT_SUFFICIENT_FIELDS = DocMaker.TITLE_FIELD +',' + DocMaker.BODY_FIELD;
+
private int docSize = 0;
private PrintWriter lineFileOut = null;
private DocMaker docMaker;
private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>();
private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>();
+ private final String[] fieldsToWrite;;
+ private final boolean[] sufficientFields;
+ private final boolean checkSufficientFields;
public WriteLineDocTask(PerfRunData runData) throws Exception {
super(runData);
@@ -89,6 +116,51 @@ public class WriteLineDocTask extends Pe
}
lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16));
docMaker = runData.getDocMaker();
+
+ // init fields
+ String f2r = config.get("line.fields",null);
+ if (f2r == null) {
+ fieldsToWrite = DEFAULT_FIELDS;
+ } else {
+ if (f2r.indexOf(SEP)>=0) {
+ throw new IllegalArgumentException("line.fields "+f2r+" should not contain the separator char: "+SEP);
+ }
+ fieldsToWrite = f2r.split(",");
+ }
+
+ // init sufficient fields
+ sufficientFields = new boolean[fieldsToWrite.length];
+ String suff = config.get("sufficient.fields",DEFAULT_SUFFICIENT_FIELDS);
+ if (",".equals(suff)) {
+ checkSufficientFields = false;
+ } else {
+ checkSufficientFields = true;
+ HashSet<String> sf = new HashSet<String>(Arrays.asList(suff.split(",")));
+ for (int i=0; i<fieldsToWrite.length; i++) {
+ if (sf.contains(fieldsToWrite[i])) {
+ sufficientFields[i] = true;
+ }
+ }
+ }
+
+ writeHeader();
+ }
+
+ /**
+ * Write a header to the lines file - indicating how to read the file later
+ */
+ private void writeHeader() {
+ StringBuilder sb = threadBuffer.get();
+ if (sb == null) {
+ sb = new StringBuilder();
+ threadBuffer.set(sb);
+ }
+ sb.setLength(0);
+ sb.append(FIELDS_HEADER_INDICATOR);
+ for (String f : fieldsToWrite) {
+ sb.append(SEP).append(f);
+ }
+ lineFileOut.println(sb.toString());
}
@Override
@@ -106,27 +178,26 @@ public class WriteLineDocTask extends Pe
threadNormalizer.set(matcher);
}
- Field f = doc.getField(DocMaker.BODY_FIELD);
- String body = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-
- f = doc.getField(DocMaker.TITLE_FIELD);
- String title = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-
- if (body.length() > 0 || title.length() > 0) {
-
- f = doc.getField(DocMaker.DATE_FIELD);
- String date = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : "";
-
- StringBuilder sb = threadBuffer.get();
- if (sb == null) {
- sb = new StringBuilder();
- threadBuffer.set(sb);
- }
- sb.setLength(0);
- sb.append(title).append(SEP).append(date).append(SEP).append(body);
+ StringBuilder sb = threadBuffer.get();
+ if (sb == null) {
+ sb = new StringBuilder();
+ threadBuffer.set(sb);
+ }
+ sb.setLength(0);
+
+ boolean sufficient = !checkSufficientFields;
+ for (int i=0; i<fieldsToWrite.length; i++) {
+ Field f = doc.getField(fieldsToWrite[i]);
+ String text = f == null ? "" : matcher.reset(f.stringValue()).replaceAll(" ").trim();
+ sb.append(text).append(SEP);
+ sufficient |= text.length()>0 && sufficientFields[i];
+ }
+ if (sufficient) {
+ sb.setLength(sb.length()-1); // remove redundant last separator
// lineFileOut is a PrintWriter, which synchronizes internally in println.
lineFileOut.println(sb.toString());
}
+
return 1;
}
Modified: lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Mon Mar 21 14:59:42 2011
@@ -36,6 +36,7 @@ import org.apache.lucene.benchmark.byTas
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
+import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldsEnum;
@@ -393,8 +394,13 @@ public class TestPerfTasksLogic extends
BufferedReader r = new BufferedReader(new FileReader(lineFile));
int numLines = 0;
- while(r.readLine() != null)
+ String line;
+ while((line = r.readLine()) != null) {
+ if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
+ continue; // do not count the header line as a doc
+ }
numLines++;
+ }
r.close();
assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines, NUM_TRY_DOCS, numLines);
Modified: lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java (original)
+++ lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.java Mon Mar 21 14:59:42 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.byTa
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
+import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.Properties;
@@ -28,6 +29,8 @@ import org.apache.commons.compress.compr
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.HeaderLineParser;
+import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.LineParser;
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
@@ -44,29 +47,71 @@ public class LineDocSourceTest extends B
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
- private void createBZ2LineFile(File file) throws Exception {
+ private void createBZ2LineFile(File file, boolean addHeader) throws Exception {
OutputStream out = new FileOutputStream(file);
out = csFactory.createCompressorOutputStream("bzip2", out);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+ writeDocsToFile(writer, addHeader, null);
+ writer.close();
+ }
+
+ private void writeDocsToFile(BufferedWriter writer, boolean addHeader, Properties otherFields) throws IOException {
+ if (addHeader) {
+ writer.write(WriteLineDocTask.FIELDS_HEADER_INDICATOR);
+ writer.write(WriteLineDocTask.SEP);
+ writer.write(DocMaker.TITLE_FIELD);
+ writer.write(WriteLineDocTask.SEP);
+ writer.write(DocMaker.DATE_FIELD);
+ writer.write(WriteLineDocTask.SEP);
+ writer.write(DocMaker.BODY_FIELD);
+ if (otherFields!=null) {
+ // additional field names in the header
+ for (Object fn : otherFields.keySet()) {
+ writer.write(WriteLineDocTask.SEP);
+ writer.write(fn.toString());
+ }
+ }
+ writer.newLine();
+ }
StringBuilder doc = new StringBuilder();
- doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
+ doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
+ if (otherFields!=null) {
+ // additional field values in the doc line
+ for (Object fv : otherFields.values()) {
+ doc.append(WriteLineDocTask.SEP).append(fv.toString());
+ }
+ }
writer.write(doc.toString());
writer.newLine();
+ }
+
+ private void createRegularLineFile(File file, boolean addHeader) throws Exception {
+ OutputStream out = new FileOutputStream(file);
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
+ writeDocsToFile(writer, addHeader, null);
writer.close();
}
- private void createRegularLineFile(File file) throws Exception {
+ private void createRegularLineFileWithMoreFields(File file, String...extraFields) throws Exception {
OutputStream out = new FileOutputStream(file);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8"));
- StringBuilder doc = new StringBuilder();
- doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body");
- writer.write(doc.toString());
- writer.newLine();
+ Properties p = new Properties();
+ for (String f : extraFields) {
+ p.setProperty(f, f);
+ }
+ writeDocsToFile(writer, true, p);
writer.close();
}
private void doIndexAndSearchTest(File file, boolean setBZCompress,
- String bz2CompressVal) throws Exception {
+ String bz2CompressVal, Class<? extends LineParser> lineParserClass, String storedField) throws Exception {
+ doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 1, storedField); // no extra repetitions
+ doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 2, storedField); // 1 extra repetition
+ doIndexAndSearchTestWithRepeats(file, setBZCompress, bz2CompressVal, lineParserClass, 4, storedField); // 3 extra repetitions
+ }
+
+ private void doIndexAndSearchTestWithRepeats(File file, boolean setBZCompress,
+ String bz2CompressVal, Class<? extends LineParser> lineParserClass, int numAdds, String storedField) throws Exception {
Properties props = new Properties();
@@ -75,11 +120,16 @@ public class LineDocSourceTest extends B
if (setBZCompress) {
props.setProperty("bzip.compression", bz2CompressVal);
}
+ if (lineParserClass != null) {
+ props.setProperty("line.parser", lineParserClass.getName());
+ }
// Indexing configuration.
props.setProperty("analyzer", MockAnalyzer.class.getName());
props.setProperty("content.source", LineDocSource.class.getName());
props.setProperty("directory", "RAMDirectory");
+ props.setProperty("doc.stored", "true");
+ props.setProperty("doc.index.props", "true");
// Create PerfRunData
Config config = new Config(props);
@@ -87,34 +137,60 @@ public class LineDocSourceTest extends B
TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false);
tasks.addTask(new CreateIndexTask(runData));
- tasks.addTask(new AddDocTask(runData));
+ for (int i=0; i<numAdds; i++) {
+ tasks.addTask(new AddDocTask(runData));
+ }
tasks.addTask(new CloseIndexTask(runData));
tasks.doLogic();
IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true);
TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10);
- assertEquals(1, td.totalHits);
+ assertEquals(numAdds, td.totalHits);
assertNotNull(td.scoreDocs[0]);
+
+ if (storedField==null) {
+ storedField = DocMaker.BODY_FIELD; // added to all docs and satisfies field-name == value
+ }
+ assertEquals("Wrong field value", storedField, searcher.doc(0).get(storedField));
+
searcher.close();
}
/* Tests LineDocSource with a bzip2 input stream. */
public void testBZip2() throws Exception {
File file = new File(getWorkDir(), "one-line.bz2");
- createBZ2LineFile(file);
- doIndexAndSearchTest(file, true, "true");
+ createBZ2LineFile(file,true);
+ doIndexAndSearchTest(file, true, "true", null, null);
+ }
+
+ public void testBZip2NoHeaderLine() throws Exception {
+ File file = new File(getWorkDir(), "one-line.bz2");
+ createBZ2LineFile(file,false);
+ doIndexAndSearchTest(file, true, "true", null, null);
}
public void testBZip2AutoDetect() throws Exception {
File file = new File(getWorkDir(), "one-line.bz2");
- createBZ2LineFile(file);
- doIndexAndSearchTest(file, false, null);
+ createBZ2LineFile(file,false);
+ doIndexAndSearchTest(file, false, null, null, null);
}
public void testRegularFile() throws Exception {
File file = new File(getWorkDir(), "one-line");
- createRegularLineFile(file);
- doIndexAndSearchTest(file, false, null);
+ createRegularLineFile(file,true);
+ doIndexAndSearchTest(file, false, null, null, null);
+ }
+
+ public void testRegularFileSpecialHeader() throws Exception {
+ File file = new File(getWorkDir(), "one-line");
+ createRegularLineFile(file,true);
+ doIndexAndSearchTest(file, false, null, HeaderLineParser.class, null);
+ }
+
+ public void testRegularFileNoHeaderLine() throws Exception {
+ File file = new File(getWorkDir(), "one-line");
+ createRegularLineFile(file,false);
+ doIndexAndSearchTest(file, false, null, null, null);
}
public void testInvalidFormat() throws Exception {
@@ -134,7 +210,7 @@ public class LineDocSourceTest extends B
writer.newLine();
writer.close();
try {
- doIndexAndSearchTest(file, false, null);
+ doIndexAndSearchTest(file, false, null, null, null);
fail("Some exception should have been thrown for: [" + testCases[i] + "]");
} catch (Exception e) {
// expected.
@@ -142,4 +218,19 @@ public class LineDocSourceTest extends B
}
}
+ /** Doc Name is not part of the default header */
+ public void testWithDocsName() throws Exception {
+ File file = new File(getWorkDir(), "one-line");
+ createRegularLineFileWithMoreFields(file, DocMaker.NAME_FIELD);
+ doIndexAndSearchTest(file, false, null, null, DocMaker.NAME_FIELD);
+ }
+
+ /** Use field names that are not defined in DocMaker and so will go to Properties */
+ public void testWithProperties() throws Exception {
+ File file = new File(getWorkDir(), "one-line");
+ String specialField = "mySpecialField";
+ createRegularLineFileWithMoreFields(file, specialField);
+ doIndexAndSearchTest(file, false, null, null, specialField);
+ }
+
}
Modified: lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java?rev=1083816&r1=1083815&r2=1083816&view=diff
==============================================================================
--- lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java (original)
+++ lucene/dev/trunk/modules/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.java Mon Mar 21 14:59:42 2011
@@ -98,6 +98,25 @@ public class WriteLineDocTaskTest extend
return doc;
}
}
+
+ // class has to be public so that Class.forName.newInstance() will work
+ // same as JustDate just that this one is treated as legal
+ public static final class LegalJustDateDocMaker extends DocMaker {
+ @Override
+ public Document makeDocument() throws Exception {
+ Document doc = new Document();
+ doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS));
+ return doc;
+ }
+ }
+
+ // class has to be public so that Class.forName.newInstance() will work
+ public static final class EmptyDocMaker extends DocMaker {
+ @Override
+ public Document makeDocument() throws Exception {
+ return new Document();
+ }
+ }
// class has to be public so that Class.forName.newInstance() will work
public static final class ThreadingDocMaker extends DocMaker {
@@ -117,6 +136,7 @@ public class WriteLineDocTaskTest extend
private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();
private PerfRunData createPerfRunData(File file, boolean setBZCompress,
+ boolean allowEmptyDocs,
String bz2CompressVal,
String docMakerName) throws Exception {
Properties props = new Properties();
@@ -126,6 +146,13 @@ public class WriteLineDocTaskTest extend
props.setProperty("bzip.compression", bz2CompressVal);
}
props.setProperty("directory", "RAMDirectory"); // no accidental FS dir.
+ if (allowEmptyDocs) {
+ props.setProperty("sufficient.fields", ",");
+ }
+ if (docMakerName.equals(LegalJustDateDocMaker.class.getName())) {
+ props.setProperty("line.fields", DocMaker.DATE_FIELD);
+ props.setProperty("sufficient.fields", DocMaker.DATE_FIELD);
+ }
Config config = new Config(props);
return new PerfRunData(config);
}
@@ -139,6 +166,8 @@ public class WriteLineDocTaskTest extend
BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8"));
try {
String line = br.readLine();
+ assertHeaderLine(line);
+ line = br.readLine();
assertNotNull(line);
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
int numExpParts = expBody == null ? 2 : 3;
@@ -153,13 +182,17 @@ public class WriteLineDocTaskTest extend
br.close();
}
}
+
+ private void assertHeaderLine(String line) {
+ assertTrue("First line should be a header line",line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR));
+ }
/* Tests WriteLineDocTask with a bzip2 format. */
public void testBZip2() throws Exception {
// Create a document in bz2 format.
File file = new File(getWorkDir(), "one-line.bz2");
- PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName());
+ PerfRunData runData = createPerfRunData(file, true, false, "true", WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@@ -171,7 +204,7 @@ public class WriteLineDocTaskTest extend
// Create a document in bz2 format.
File file = new File(getWorkDir(), "one-line.bz2");
- PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName());
+ PerfRunData runData = createPerfRunData(file, false, false, null, WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@@ -183,7 +216,7 @@ public class WriteLineDocTaskTest extend
// Create a document in regular format.
File file = new File(getWorkDir(), "one-line");
- PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName());
+ PerfRunData runData = createPerfRunData(file, true, false, "false", WriteLineDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@@ -196,7 +229,7 @@ public class WriteLineDocTaskTest extend
// separator char. However, it didn't replace newline characters, which
// resulted in errors in LineDocSource.
File file = new File(getWorkDir(), "one-line");
- PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName());
+ PerfRunData runData = createPerfRunData(file, false, false, null, NewLinesDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@@ -209,7 +242,7 @@ public class WriteLineDocTaskTest extend
// had a TITLE element (LUCENE-1755). It should throw away documents if they
// don't have BODY nor TITLE
File file = new File(getWorkDir(), "one-line");
- PerfRunData runData = createPerfRunData(file, false, null, NoBodyDocMaker.class.getName());
+ PerfRunData runData = createPerfRunData(file, false, false, null, NoBodyDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@@ -219,7 +252,7 @@ public class WriteLineDocTaskTest extend
public void testEmptyTitle() throws Exception {
File file = new File(getWorkDir(), "one-line");
- PerfRunData runData = createPerfRunData(file, false, null, NoTitleDocMaker.class.getName());
+ PerfRunData runData = createPerfRunData(file, false, false, null, NoTitleDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@@ -227,9 +260,10 @@ public class WriteLineDocTaskTest extend
doReadTest(file, false, "", "date", "body");
}
+ /** Fail by default when there's only date */
public void testJustDate() throws Exception {
File file = new File(getWorkDir(), "one-line");
- PerfRunData runData = createPerfRunData(file, false, null, JustDateDocMaker.class.getName());
+ PerfRunData runData = createPerfRunData(file, false, false, null, JustDateDocMaker.class.getName());
WriteLineDocTask wldt = new WriteLineDocTask(runData);
wldt.doLogic();
wldt.close();
@@ -237,15 +271,53 @@ public class WriteLineDocTaskTest extend
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
try {
String line = br.readLine();
+ assertHeaderLine(line);
+ line = br.readLine();
assertNull(line);
} finally {
br.close();
}
}
+ public void testLegalJustDate() throws Exception {
+ File file = new File(getWorkDir(), "one-line");
+ PerfRunData runData = createPerfRunData(file, false, false, null, LegalJustDateDocMaker.class.getName());
+ WriteLineDocTask wldt = new WriteLineDocTask(runData);
+ wldt.doLogic();
+ wldt.close();
+
+ BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
+ try {
+ String line = br.readLine();
+ assertHeaderLine(line);
+ line = br.readLine();
+ assertNotNull(line);
+ } finally {
+ br.close();
+ }
+ }
+
+ public void testEmptyDoc() throws Exception {
+ File file = new File(getWorkDir(), "one-line");
+ PerfRunData runData = createPerfRunData(file, false, true, null, EmptyDocMaker.class.getName());
+ WriteLineDocTask wldt = new WriteLineDocTask(runData);
+ wldt.doLogic();
+ wldt.close();
+
+ BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
+ try {
+ String line = br.readLine();
+ assertHeaderLine(line);
+ line = br.readLine();
+ assertNotNull(line);
+ } finally {
+ br.close();
+ }
+ }
+
public void testMultiThreaded() throws Exception {
File file = new File(getWorkDir(), "one-line");
- PerfRunData runData = createPerfRunData(file, false, null, ThreadingDocMaker.class.getName());
+ PerfRunData runData = createPerfRunData(file, false, false, null, ThreadingDocMaker.class.getName());
final WriteLineDocTask wldt = new WriteLineDocTask(runData);
Thread[] threads = new Thread[10];
for (int i = 0; i < threads.length; i++) {
@@ -269,8 +341,10 @@ public class WriteLineDocTaskTest extend
Set<String> ids = new HashSet<String>();
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
try {
+ String line = br.readLine();
+ assertHeaderLine(line); // header line is written once, no matter how many threads there are
for (int i = 0; i < threads.length; i++) {
- String line = br.readLine();
+ line = br.readLine();
String[] parts = line.split(Character.toString(WriteLineDocTask.SEP));
assertEquals(3, parts.length);
// check that all thread names written are the same in the same line