You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2010/12/20 23:30:51 UTC

svn commit: r1051306 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ lucene/contrib/benchmark/src/te...

Author: mikemccand
Date: Mon Dec 20 22:30:50 2010
New Revision: 1051306

URL: http://svn.apache.org/viewvc?rev=1051306&view=rev
Log:
LUCENE-2826: LineDocSource assigns stable docids; add 2 NumericFields derived from date in the line doc file

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
    lucene/dev/branches/branch_3x/solr/   (props changed)

Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java Mon Dec 20 22:30:50 2010
@@ -29,6 +29,7 @@ public class DocData {
   private String body;
   private String title;
   private String date;
+  private int id;
   private Properties props;
   
   public void clear() {
@@ -37,6 +38,7 @@ public class DocData {
     title = null;
     date = null;
     props = null;
+    id = -1;
   }
   
   public String getBody() {
@@ -57,6 +59,10 @@ public class DocData {
     return name;
   }
 
+  public int getID() {
+    return id;
+  }
+
   public Properties getProps() {
     return props;
   }
@@ -85,6 +91,10 @@ public class DocData {
     this.name = name;
   }
 
+  public void setID(int id) {
+    this.id = id;
+  }
+
   public void setProps(Properties props) {
     this.props = props;
   }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java Mon Dec 20 22:30:50 2010
@@ -20,14 +20,21 @@ package org.apache.lucene.benchmark.byTa
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.util.HashMap;
+import java.util.Calendar;
 import java.util.Map;
 import java.util.Properties;
+import java.util.Locale;
 import java.util.Random;
+import java.util.Date;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.text.SimpleDateFormat;
+import java.text.ParsePosition;
 
 import org.apache.lucene.benchmark.byTask.utils.Config;
 import org.apache.lucene.benchmark.byTask.utils.Format;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
 import org.apache.lucene.document.Field.Index;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.Field.TermVector;
@@ -82,6 +89,7 @@ public class DocMaker {
   static class DocState {
     
     private final Map<String,Field> fields;
+    private final Map<String,NumericField> numericFields;
     private final boolean reuseFields;
     final Document doc;
     DocData docData = new DocData();
@@ -92,6 +100,7 @@ public class DocMaker {
       
       if (reuseFields) {
         fields =  new HashMap<String,Field>();
+        numericFields = new HashMap<String,NumericField>();
         
         // Initialize the map with the default fields.
         fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector));
@@ -99,9 +108,13 @@ public class DocMaker {
         fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector));
         fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
         fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector));
+
+        numericFields.put(DATE_MSEC_FIELD, new NumericField(DATE_MSEC_FIELD));
+        numericFields.put(TIME_SEC_FIELD, new NumericField(TIME_SEC_FIELD));
         
         doc = new Document();
       } else {
+        numericFields = null;
         fields = null;
         doc = null;
       }
@@ -124,18 +137,42 @@ public class DocMaker {
       }
       return f;
     }
+
+    NumericField getNumericField(String name) {
+      if (!reuseFields) {
+        return new NumericField(name);
+      }
+
+      NumericField f = numericFields.get(name);
+      if (f == null) {
+        f = new NumericField(name);
+        numericFields.put(name, f);
+      }
+      return f;
+    }
   }
   
-  private int numDocsCreated = 0;
   private boolean storeBytes = false;
 
+  private static class DateUtil {
+    public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
+    public Calendar cal = Calendar.getInstance();
+    public ParsePosition pos = new ParsePosition(0);
+    public DateUtil() {
+      parser.setLenient(true);
+    }
+  }
+
   // leftovers are thread local, because it is unsafe to share residues between threads
   private ThreadLocal<LeftOver> leftovr = new ThreadLocal<LeftOver>();
   private ThreadLocal<DocState> docState = new ThreadLocal<DocState>();
+  private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<DateUtil>();
 
   public static final String BODY_FIELD = "body";
   public static final String TITLE_FIELD = "doctitle";
   public static final String DATE_FIELD = "docdate";
+  public static final String DATE_MSEC_FIELD = "docdatenum";
+  public static final String TIME_SEC_FIELD = "doctimesecnum";
   public static final String ID_FIELD = "docid";
   public static final String BYTES_FIELD = "bytes";
   public static final String NAME_FIELD = "docname";
@@ -155,6 +192,7 @@ public class DocMaker {
   private int lastPrintedNumUniqueTexts = 0;
 
   private long lastPrintedNumUniqueBytes = 0;
+  private final AtomicInteger numDocsCreated = new AtomicInteger();
 
   private int printNum = 0;
 
@@ -169,7 +207,16 @@ public class DocMaker {
     
     // Set ID_FIELD
     Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
-    idField.setValue("doc" + (r != null ? r.nextInt(updateDocIDLimit) : incrNumDocsCreated()));
+    int id;
+    if (r != null) {
+      id = r.nextInt(updateDocIDLimit);
+    } else {
+      id = docData.getID();
+      if (id == -1) {
+        id = numDocsCreated.getAndIncrement();
+      }
+    }
+    idField.setValue(Integer.toString(id));
     doc.add(idField);
     
     // Set NAME_FIELD
@@ -181,13 +228,39 @@ public class DocMaker {
     doc.add(nameField);
     
     // Set DATE_FIELD
-    String date = docData.getDate();
+    DateUtil util = dateParsers.get();
+    if (util == null) {
+      util = new DateUtil();
+      dateParsers.set(util);
+    }
+    Date date = null;
+    String dateString = docData.getDate();
+    if (dateString != null) {
+      util.pos.setIndex(0);
+      date = util.parser.parse(dateString, util.pos);
+      //System.out.println(dateString + " parsed to " + date);
+    } else {
+      dateString = "";
+    }
+    Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
+    dateStringField.setValue(dateString);
+    doc.add(dateStringField);
+
     if (date == null) {
-      date = "";
+      // just set to right now
+      date = new Date();
     }
-    Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
-    dateField.setValue(date);
+
+    NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
+    dateField.setLongValue(date.getTime());
     doc.add(dateField);
+
+    util.cal.setTime(date);
+    final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND);
+
+    NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
+    timeSecField.setIntValue(sec);
+    doc.add(timeSecField);
     
     // Set TITLE_FIELD
     String title = docData.getTitle();
@@ -252,10 +325,6 @@ public class DocMaker {
     return ds;
   }
 
-  protected synchronized int incrNumDocsCreated() {
-    return numDocsCreated++;
-  }
-
   /**
    * Closes the {@link DocMaker}. The base implementation closes the
    * {@link ContentSource}, and it can be overridden to do more work (but make
@@ -363,7 +432,7 @@ public class DocMaker {
     // re-initiate since properties by round may have changed.
     setConfig(config);
     source.resetInputs();
-    numDocsCreated = 0;
+    numDocsCreated.set(0);
     resetLeftovers();
   }
   

Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java Mon Dec 20 22:30:50 2010
@@ -48,6 +48,7 @@ public class LineDocSource extends Conte
 
   private File file;
   private BufferedReader reader;
+  private int readCount;
 
   private synchronized void openFile() {
     try {
@@ -71,9 +72,12 @@ public class LineDocSource extends Conte
   
   @Override
   public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
-    String line;
+    final String line;
+    final int myID;
+    
     synchronized(this) {
       line = reader.readLine();
+      myID = readCount++;
       if (line == null) {
         if (!forever) {
           throw new NoMoreDataException();
@@ -96,6 +100,7 @@ public class LineDocSource extends Conte
     }
     // The date String was written in the format of DateTools.dateToString.
     docData.clear();
+    docData.setID(myID);
     docData.setBody(line.substring(1 + spot2, line.length()));
     docData.setTitle(line.substring(0, spot));
     docData.setDate(line.substring(1 + spot, spot2));

Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java Mon Dec 20 22:30:50 2010
@@ -26,6 +26,7 @@ import org.apache.lucene.benchmark.byTas
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.document.NumericField;
 
 /**
  * Simple task to test performance of tokenizers.  It just
@@ -67,7 +68,7 @@ public class ReadTokensTask extends Perf
     Analyzer analyzer = getRunData().getAnalyzer();
     int tokenCount = 0;
     for(final Fieldable field : fields) {
-      if (!field.isTokenized()) continue;
+      if (!field.isTokenized() || field instanceof NumericField) continue;
       
       final TokenStream stream;
       final TokenStream streamValue = field.tokenStreamValue();

Modified: lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1051306&r1=1051305&r2=1051306&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Mon Dec 20 22:30:50 2010
@@ -473,8 +473,8 @@ public class TestPerfTasksLogic extends 
     while(terms.next()) {
       Term term = terms.term();
       /* not-tokenized, but indexed field */
-      if (term != null && term.field() != DocMaker.ID_FIELD) { 
-        termDocs.seek(terms.term());
+      if (term != null && term.field() != DocMaker.ID_FIELD && term.field() != DocMaker.DATE_MSEC_FIELD && term.field() != DocMaker.TIME_SEC_FIELD) {
+          termDocs.seek(terms.term());
         while (termDocs.next())
           totalTokenCount2 += termDocs.freq();
       }