You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2010/12/20 23:30:51 UTC
svn commit: r1051306 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/
lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/
lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/ solr/
Author: mikemccand
Date: Mon Dec 20 22:30:50 2010
New Revision: 1051306
URL: http://svn.apache.org/viewvc?rev=1051306&view=rev
Log:
LUCENE-2826: LineDocSource assigns stable docids; add 2 NumericFields derived from date in the line doc file
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
lucene/dev/branches/branch_3x/solr/ (props changed)
Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java Mon Dec 20 22:30:50 2010
@@ -29,6 +29,7 @@ public class DocData {
private String body;
private String title;
private String date;
+ private int id;
private Properties props;
public void clear() {
@@ -37,6 +38,7 @@ public class DocData {
title = null;
date = null;
props = null;
+ id = -1;
}
public String getBody() {
@@ -57,6 +59,10 @@ public class DocData {
return name;
}
+ public int getID() {
+ return id;
+ }
+
public Properties getProps() {
return props;
}
@@ -85,6 +91,10 @@ public class DocData {
this.name = name;
}
+ public void setID(int id) {
+ this.id = id;
+ }
+
public void setProps(Properties props) {
this.props = props;
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java Mon Dec 20 22:30:50 2010
@@ -20,14 +20,21 @@ package org.apache.lucene.benchmark.byTa
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
+import java.util.Calendar;
import java.util.Map;
import java.util.Properties;
+import java.util.Locale;
import java.util.Random;
+import java.util.Date;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.text.SimpleDateFormat;
+import java.text.ParsePosition;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.Format;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
@@ -82,6 +89,7 @@ public class DocMaker {
static class DocState {
private final Map<String,Field> fields;
+ private final Map<String,NumericField> numericFields;
private final boolean reuseFields;
final Document doc;
DocData docData = new DocData();
@@ -92,6 +100,7 @@ public class DocMaker {
if (reuseFields) {
fields = new HashMap<String,Field>();
+ numericFields = new HashMap<String,NumericField>();
// Initialize the map with the default fields.
fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector));
@@ -99,9 +108,13 @@ public class DocMaker {
fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));
+
+ numericFields.put(DATE_MSEC_FIELD, new NumericField(DATE_MSEC_FIELD));
+ numericFields.put(TIME_SEC_FIELD, new NumericField(TIME_SEC_FIELD));
doc = new Document();
} else {
+ numericFields = null;
fields = null;
doc = null;
}
@@ -124,18 +137,42 @@ public class DocMaker {
}
return f;
}
+
+ NumericField getNumericField(String name) {
+ if (!reuseFields) {
+ return new NumericField(name);
+ }
+
+ NumericField f = numericFields.get(name);
+ if (f == null) {
+ f = new NumericField(name);
+ numericFields.put(name, f);
+ }
+ return f;
+ }
}
- private int numDocsCreated = 0;
private boolean storeBytes = false;
+ private static class DateUtil {
+ public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
+ public Calendar cal = Calendar.getInstance();
+ public ParsePosition pos = new ParsePosition(0);
+ public DateUtil() {
+ parser.setLenient(true);
+ }
+ }
+
// leftovers are thread local, because it is unsafe to share residues between threads
private ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();
private ThreadLocal<DocState> docState = new ThreadLocal<DocState>();
+ private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<DateUtil>();
public static final String BODY_FIELD = "body";
public static final String TITLE_FIELD = "doctitle";
public static final String DATE_FIELD = "docdate";
+ public static final String DATE_MSEC_FIELD = "docdatenum";
+ public static final String TIME_SEC_FIELD = "doctimesecnum";
public static final String ID_FIELD = "docid";
public static final String BYTES_FIELD = "bytes";
public static final String NAME_FIELD = "docname";
@@ -155,6 +192,7 @@ public class DocMaker {
private int lastPrintedNumUniqueTexts = 0;
private long lastPrintedNumUniqueBytes = 0;
+ private final AtomicInteger numDocsCreated = new AtomicInteger();
private int printNum = 0;
@@ -169,7 +207,16 @@ public class DocMaker {
// Set ID_FIELD
Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
- idField.setValue("doc" + (r != null ? r.nextInt(updateDocIDLimit) : incrNumDocsCreated()));
+ int id;
+ if (r != null) {
+ id = r.nextInt(updateDocIDLimit);
+ } else {
+ id = docData.getID();
+ if (id == -1) {
+ id = numDocsCreated.getAndIncrement();
+ }
+ }
+ idField.setValue(Integer.toString(id));
doc.add(idField);
// Set NAME_FIELD
@@ -181,13 +228,39 @@ public class DocMaker {
doc.add(nameField);
// Set DATE_FIELD
- String date = docData.getDate();
+ DateUtil util = dateParsers.get();
+ if (util == null) {
+ util = new DateUtil();
+ dateParsers.set(util);
+ }
+ Date date = null;
+ String dateString = docData.getDate();
+ if (dateString != null) {
+ util.pos.setIndex(0);
+ date = util.parser.parse(dateString, util.pos);
+ //System.out.println(dateString + " parsed to " + date);
+ } else {
+ dateString = "";
+ }
+ Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
+ dateStringField.setValue(dateString);
+ doc.add(dateStringField);
+
if (date == null) {
- date = "";
+ // just set to right now
+ date = new Date();
}
- Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
- dateField.setValue(date);
+
+ NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
+ dateField.setLongValue(date.getTime());
doc.add(dateField);
+
+ util.cal.setTime(date);
+ final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND);
+
+ NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
+ timeSecField.setIntValue(sec);
+ doc.add(timeSecField);
// Set TITLE_FIELD
String title = docData.getTitle();
@@ -252,10 +325,6 @@ public class DocMaker {
return ds;
}
- protected synchronized int incrNumDocsCreated() {
- return numDocsCreated++;
- }
-
/**
* Closes the {@link DocMaker}. The base implementation closes the
* {@link ContentSource}, and it can be overridden to do more work (but make
@@ -363,7 +432,7 @@ public class DocMaker {
// re-initiate since properties by round may have changed.
setConfig(config);
source.resetInputs();
- numDocsCreated = 0;
+ numDocsCreated.set(0);
resetLeftovers();
}
Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java Mon Dec 20 22:30:50 2010
@@ -48,6 +48,7 @@ public class LineDocSource extends Conte
private File file;
private BufferedReader reader;
+ private int readCount;
private synchronized void openFile() {
try {
@@ -71,9 +72,12 @@ public class LineDocSource extends Conte
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
- String line;
+ final String line;
+ final int myID;
+
synchronized(this) {
line = reader.readLine();
+ myID = readCount++;
if (line == null) {
if (!forever) {
throw new NoMoreDataException();
@@ -96,6 +100,7 @@ public class LineDocSource extends Conte
}
// The date String was written in the format of DateTools.dateToString.
docData.clear();
+ docData.setID(myID);
docData.setBody(line.substring(1 + spot2, line.length()));
docData.setTitle(line.substring(0, spot));
docData.setDate(line.substring(1 + spot, spot2));
Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java Mon Dec 20 22:30:50 2010
@@ -26,6 +26,7 @@ import org.apache.lucene.benchmark.byTas
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.document.NumericField;
/**
* Simple task to test performance of tokenizers. It just
@@ -67,7 +68,7 @@ public class ReadTokensTask extends Perf
Analyzer analyzer = getRunData().getAnalyzer();
int tokenCount = 0;
for(final Fieldable field : fields) {
- if (!field.isTokenized()) continue;
+ if (!field.isTokenized() || field instanceof NumericField) continue;
final TokenStream stream;
final TokenStream streamValue = field.tokenStreamValue();
Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Mon Dec 20 22:30:50 2010
@@ -473,8 +473,8 @@ public class TestPerfTasksLogic extends
while(terms.next()) {
Term term = terms.term();
/* not-tokenized, but indexed field */
- if (term != null && term.field() != DocMaker.ID_FIELD) {
- termDocs.seek(terms.term());
+ if (term != null && term.field() != DocMaker.ID_FIELD && term.field() != DocMaker.DATE_MSEC_FIELD && term.field() != DocMaker.TIME_SEC_FIELD) {
+ termDocs.seek(terms.term());
while (termDocs.next())
totalTokenCount2 += termDocs.freq();
}