You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by gs...@apache.org on 2007/03/23 04:48:13 UTC

svn commit: r521569 - in /lucene/java/trunk/contrib/benchmark: ./ src/java/org/apache/lucene/benchmark/byTask/feeds/ src/java/org/apache/lucene/benchmark/byTask/tasks/

Author: gsingers
Date: Thu Mar 22 20:48:12 2007
New Revision: 521569

URL: http://svn.apache.org/viewvc?view=rev&rev=521569
Log:
LUCENE-837:
Added an optional bytes field that can be stored on the Document.  ReutersDocMaker can now store byte data in a field: if the reuters.doc.maker.store.bytes param is set (see the javadocs), the contents of the body are stored as a UTF-8 byte array.

In addition, the SearchTravRetLoadFieldSelectorTask (whew) can take parameters specifying which fields to load (all other fields are ignored by default).

Added:
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java   (with props)
Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?view=diff&rev=521569&r1=521568&r2=521569
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Thu Mar 22 20:48:12 2007
@@ -4,6 +4,12 @@
 
 $Id:$
 
+3/22/07
+
+-Moved withRetrieve() call out of the loop in ReadTask
+-Added SearchTravRetLoadFieldSelectorTask to help benchmark some of the FieldSelector capabilities
+-Added options to store content bytes on the Reuters Doc (and others, but Reuters is the only one w/ it enabled)
+
 3/21/07
 
 Tests (for benchmarking code correctness) were added - LUCENE-840.

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java?view=diff&rev=521569&r1=521568&r2=521569
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/BasicDocMaker.java Thu Mar 22 20:48:12 2007
@@ -17,18 +17,18 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.Format;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
 import java.io.File;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.Properties;
 
-import org.apache.lucene.document.DateTools;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.benchmark.byTask.utils.Config;
-import org.apache.lucene.benchmark.byTask.utils.Format;
-
 
 /**
  * Create documents for the test.
@@ -44,6 +44,7 @@
     Date date;
     String title;
     String body;
+    byte [] bytes;
     Properties props;
   }
   
@@ -122,6 +123,10 @@
         docData.body = docData.body.substring(size); // some left
       }
       doc.add(new Field(BODY_FIELD, bdy, storeVal, indexVal, termVecVal));
+    }
+    if (docData.bytes != null && docData.bytes.length != 0)
+    {
+      doc.add(new Field("bytes", docData.bytes, Field.Store.YES));
     }
     if (docData.props!=null) {
       for (Iterator it = docData.props.keySet().iterator(); it.hasNext(); ) {

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java?view=diff&rev=521569&r1=521568&r2=521569
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersDocMaker.java Thu Mar 22 20:48:12 2007
@@ -17,6 +17,8 @@
  * limitations under the License.
  */
 
+import org.apache.lucene.benchmark.byTask.utils.Config;
+
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
@@ -25,11 +27,14 @@
 import java.util.ArrayList;
 import java.util.Locale;
 
-import org.apache.lucene.benchmark.byTask.utils.Config;
-
 
 /**
  * A DocMaker using the Reuters collection for its input.
+ *
+ * Config properties:
+ * docs.dir=<path to the docs dir| Default: reuters-out>
+ * reuters.doc.maker.store.bytes=true|false Default: false
+ *
  */
 public class ReutersDocMaker extends BasicDocMaker {
 
@@ -38,7 +43,7 @@
   private ArrayList inputFiles = new ArrayList();
   private int nextFile = 0;
   private int iteration=0;
-  
+  private boolean storeBytes = false;
   /* (non-Javadoc)
    * @see SimpleDocMaker#setConfig(java.util.Properties)
    */
@@ -46,6 +51,8 @@
     super.setConfig(config);
     String d = config.get("docs.dir","reuters-out");
     dataDir = new File(new File("work"),d);
+    storeBytes = config.get("reuters.doc.maker.store.bytes", false);
+
     collectFiles(dataDir,inputFiles);
     if (inputFiles.size()==0) {
       throw new RuntimeException("No txt files in dataDir: "+dataDir.getAbsolutePath());
@@ -89,6 +96,10 @@
     dd.name = name;
     dd.title = title;
     dd.body = bodyBuf.toString();
+    if (storeBytes == true)
+    {
+      dd.bytes = dd.body.getBytes("UTF-8");
+    }
     return dd;
   }
 

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java?view=diff&rev=521569&r1=521568&r2=521569
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java Thu Mar 22 20:48:12 2007
@@ -26,6 +26,7 @@
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 
+import java.io.IOException;
 
 
 /**
@@ -76,17 +77,15 @@
       Hits hits = searcher.search(q);
       //System.out.println("searched: "+q);
       
-      if (withTraverse() && hits!=null) {
-        Document doc = null;
-        int traversalSize = Math.min(hits.length(), traversalSize());
-        if (traversalSize > 0) {
+      if (withTraverse() && hits!=null) {
+        int traversalSize = Math.min(hits.length(), traversalSize());
+        if (traversalSize > 0) {
+          boolean retrieve = withRetrieve();
           for (int m = 0; m < hits.length(); m++) {
             int id = hits.id(m);
             res++;
-
-            if (withRetrieve()) {
-              doc = ir.document(id);
-              res += (doc==null ? 0 : 1);
+            if (retrieve) {
+              res += retrieveDoc(ir, id);
             }
           }
         }
@@ -101,6 +100,10 @@
     return res;
   }
 
+  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
+    return (ir.document(id) == null ? 0 : 1);
+  }
+
   /**
    * Return query maker used for this task.
    */
@@ -122,18 +125,18 @@
   public abstract boolean withTraverse ();
 
   /**
-   * Specify the number of hits to traverse.  Tasks should override this if they want to restrict the number
-   * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
-   *
-   * Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
-   * @return Integer.MAX_VALUE
-   */
-  public int traversalSize()
-  {
-    return Integer.MAX_VALUE;
-  }
-
-  /**
+   * Specify the number of hits to traverse.  Tasks should override this if they want to restrict the number
+   * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0.
+   *
+   * Read task calculates the traversal as: Math.min(hits.length(), traversalSize())
+   * @return Integer.MAX_VALUE
+   */
+  public int traversalSize()
+  {
+    return Integer.MAX_VALUE;
+  }
+
+  /**
    * Return true if, with search & results traversing, docs should be retrieved.
    */
   public abstract boolean withRetrieve ();

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java?view=auto&rev=521569
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java Thu Mar 22 20:48:12 2007
@@ -0,0 +1,55 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+/**
+ * Created by IntelliJ IDEA.
+ * User: Grant Ingersoll
+ * Date: Mar 22, 2007
+ * Time: 10:04:49 PM
+ * $Id:$
+ * Copyright 2007.  Center For Natural Language Processing
+ */
+
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.lucene.index.IndexReader;
+
+import java.util.StringTokenizer;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Collections;
+import java.io.IOException;
+
+/**
+ * Search and Traverse and Retrieve docs task using a SetBasedFieldSelector.
+ *
+ * <p>Note: This task reuses the reader if it is already open.
+ * Otherwise a reader is opened at start and closed at the end.
+ *
+ * Takes optional param: comma separated list of Fields to load.
+ */
+public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask {
+
+  protected FieldSelector fieldSelector;
+  public SearchTravRetLoadFieldSelectorTask(PerfRunData runData) {
+    super(runData);
+    
+  }
+
+  public boolean withRetrieve() {
+    return true;
+  }
+
+
+  protected int retrieveDoc(IndexReader ir, int id) throws IOException {
+    return (ir.document(id, fieldSelector) == null ? 0 : 1);
+  }
+
+  public void setParams(String params) {
+    Set fieldsToLoad = new HashSet();
+    for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
+      String s = tokenizer.nextToken();
+      fieldsToLoad.add(s);
+    }
+    fieldSelector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET);
+  }
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.java
------------------------------------------------------------------------------
    svn:eol-style = native