You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/05/04 21:57:21 UTC

svn commit: r168178 - in /incubator/nutch/trunk: ./ src/java/org/apache/nutch/ipc/ src/java/org/apache/nutch/searcher/ src/java/org/apache/nutch/servlet/ src/web/jsp/

Author: cutting
Date: Wed May  4 12:57:20 2005
New Revision: 168178

URL: http://svn.apache.org/viewcvs?rev=168178&view=rev
Log:
Add result sorting & deduping by fields other than site.

Modified:
    incubator/nutch/trunk/CHANGES.txt
    incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
    incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
    incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java
    incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
    incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
    incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
    incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
    incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
    incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java
    incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
    incubator/nutch/trunk/src/web/jsp/anchors.jsp
    incubator/nutch/trunk/src/web/jsp/cached.jsp
    incubator/nutch/trunk/src/web/jsp/explain.jsp
    incubator/nutch/trunk/src/web/jsp/search.jsp
    incubator/nutch/trunk/src/web/jsp/text.jsp

Modified: incubator/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/CHANGES.txt?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/CHANGES.txt (original)
+++ incubator/nutch/trunk/CHANGES.txt Wed May  4 12:57:20 2005
@@ -73,6 +73,9 @@
 15. Make query boosts for host, title, anchor and phrase matches
     configurable.  (Piotr Kosiorowski via cutting, 20050419)
 
+16. Add support for sorting search results and search-time deduping by
+    fields other than site.
+
 
 Release 0.6
 

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java Wed May  4 12:57:20 2005
@@ -380,14 +380,14 @@
         public Writable call(Writable param) throws IOException {
           try {
             Invocation call = (Invocation)param;
-            if (verbose) LOG.info("Call: " + call);
+            if (verbose) log("Call: " + call);
 
             Method method =
               implementation.getMethod(call.getMethodName(),
                                        call.getParameterClasses());
 
             Object value = method.invoke(instance, call.getParameters());
-            if (verbose) LOG.info("Return: " + value);
+            if (verbose) log("Return: "+value);
 
             return new ObjectWritable(method.getReturnType(), value);
 
@@ -407,6 +407,12 @@
           }
         }
       };
+  }
+
+  private static void log(String value) {
+    if (value!= null && value.length() > 55)
+      value = value.substring(0, 55)+"...";
+    LOG.info(value);
   }
   
 }

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java Wed May  4 12:57:20 2005
@@ -125,7 +125,8 @@
         GET_SEGMENTS = Protocol.class.getMethod
           ("getSegmentNames", new Class[] {});
         SEARCH = Protocol.class.getMethod
-          ("search", new Class[] { Query.class, Integer.TYPE});
+          ("search", new Class[] { Query.class, Integer.TYPE, String.class,
+                                   String.class, Boolean.TYPE});
         DETAILS = Protocol.class.getMethod
           ("getDetails", new Class[] { Hit.class});
         SUMMARY = Protocol.class.getMethod
@@ -179,30 +180,50 @@
         segmentToAddress.keySet().toArray(new String[segmentToAddress.size()]);
     }
 
-    public Hits search(Query query, int numHits) throws IOException {
+    public Hits search(final Query query, final int numHits,
+                       final String dedupField, final String sortField,
+                       final boolean reverse) throws IOException {
       long totalHits = 0;
       Hits[] segmentHits = new Hits[liveAddresses.length];
 
-      Object[][] params = new Object[liveAddresses.length][2];
+      Object[][] params = new Object[liveAddresses.length][5];
       for (int i = 0; i < params.length; i++) {
         params[i][0] = query;
         params[i][1] = new Integer(numHits);
+        params[i][2] = dedupField;
+        params[i][3] = sortField;
+        params[i][4] = Boolean.valueOf(reverse);
       }
       Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses);
 
-      TreeSet queue = new TreeSet();              // cull top hits from results
-      Comparable minValue = null;
+      TreeSet queue;                              // cull top hits from results
+
+      if (sortField == null || reverse) {
+        queue = new TreeSet(new Comparator() {
+            public int compare(Object o1, Object o2) {
+              return ((Comparable)o2).compareTo(o1); // reverse natural order
+            }
+          });
+      } else {
+        queue = new TreeSet();
+      }
+      
+      Comparable maxValue = null;
       for (int i = 0; i < results.length; i++) {
         Hits hits = results[i];
         if (hits == null) continue;
         totalHits += hits.getTotal();
         for (int j = 0; j < hits.getLength(); j++) {
           Hit h = hits.getHit(j);
-          if (minValue == null || h.getSite().compareTo(minValue) >= 0) {
-            queue.add(new Hit(i, h.getIndexDocNo(), h.getSite()));
+          if (maxValue == null ||
+              ((reverse || sortField == null)
+               ? h.getSortValue().compareTo(maxValue) >= 0
+               : h.getSortValue().compareTo(maxValue) <= 0)) {
+            queue.add(new Hit(i, h.getIndexDocNo(),
+                              h.getSortValue(), h.getDedupValue()));
             if (queue.size() > numHits) {         // if hit queue overfull
               queue.remove(queue.last());         // remove lowest in hit queue
-              minValue = ((Hit)queue.last()).getSite(); // reset minValue
+              maxValue = ((Hit)queue.last()).getSortValue(); // reset maxValue
             }
           }
         }
@@ -297,7 +318,7 @@
       Client client = new Client(addresses);
       //client.setTimeout(Integer.MAX_VALUE);
 
-      Hits hits = client.search(query, 10);
+      Hits hits = client.search(query, 10, null, null, false);
       System.out.println("Total hits: " + hits.getTotal());
       for (int i = 0; i < hits.getLength(); i++) {
         System.out.println(" "+i+" "+ client.getDetails(hits.getHit(i)));

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java Wed May  4 12:57:20 2005
@@ -21,6 +21,7 @@
 import java.io.IOException;
 
 import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.WritableComparable;
 import org.apache.nutch.io.UTF8;
 
 import java.util.logging.Logger;
@@ -33,25 +34,25 @@
 
   private int indexNo;                            // index id
   private int indexDocNo;                         // index-relative id
-  private float score;                            // its score for the query
-  private String site;                            // its website name
-  private boolean moreFromSiteExcluded;
+  private WritableComparable sortValue;           // value sorted on
+  private String dedupValue;                      // value to dedup on
+  private boolean moreFromDupExcluded;
 
   public Hit() {}
 
-  public Hit(int indexNo, int indexDocNo, float score, String site) {
-    this(indexDocNo, score, site);
+  public Hit(int indexNo, int indexDocNo) {
+    this(indexNo, indexDocNo, null, null);
+  }
+  public Hit(int indexNo, int indexDocNo,
+             WritableComparable sortValue,
+             String dedupValue) {
+    this(indexDocNo, sortValue, dedupValue);
     this.indexNo = indexNo;
   }
-  public Hit(int indexDocNo, float score, String site) {
+  public Hit(int indexDocNo, WritableComparable sortValue, String dedupValue) {
     this.indexDocNo = indexDocNo;
-    this.score = score;
-    // 20041006, xing
-    // The following fixes a bug that causes cached.jsp, text.jsp, etc.,
-    // to fail in distributed search. "Release 0.6, note 14" in CHANGES.txt
-    if (site == null)
-      site = "";
-    this.site = site;
+    this.sortValue = sortValue;
+    this.dedupValue = dedupValue == null ? "" : dedupValue;
   }
 
   /** Return the index number that this hit came from. */
@@ -61,31 +62,19 @@
   /** Return the document number of this hit within an index. */
   public int getIndexDocNo() { return indexDocNo; }
 
-  /** Return the degree to which this document matched the query. */
-  public float getScore() { return score; }
-
-  /** Return the name of this this document's website. */
-  public String getSite() { return site; }
+  /** Return the value of the field that hits are sorted on. */
+  public WritableComparable getSortValue() { return sortValue; }
 
-  /** True iff other, lower-scoring, hits from the same site have been excluded
-   * from the list which contains this hit.. */
-  public boolean moreFromSiteExcluded() { return moreFromSiteExcluded; }
-
-  /** True iff other, lower-scoring, hits from the same site have been excluded
-   * from the list which contains this hit.. */
-  public void setMoreFromSiteExcluded(boolean more){moreFromSiteExcluded=more;}
+  /** Return the value of the field that hits should be deduplicated on. */
+  public String getDedupValue() { return dedupValue; }
 
-  public void write(DataOutput out) throws IOException {
-    out.writeInt(indexDocNo);
-    out.writeFloat(score);
-    UTF8.writeString(out, site);
-  }
-
-  public void readFields(DataInput in) throws IOException {
-    indexDocNo = in.readInt();
-    score = in.readFloat();
-    site = UTF8.readString(in);
-  }
+  /** True iff other, lower-scoring, hits with the same dedup value have been
+   * excluded from the list which contains this hit. */
+  public boolean moreFromDupExcluded() { return moreFromDupExcluded; }
+
+  /** Set whether other, lower-scoring, hits with the same dedup value have
+   * been excluded from the list which contains this hit. */
+  public void setMoreFromDupExcluded(boolean more){moreFromDupExcluded=more;}
 
   /** Display as a string. */
   public String toString() {
@@ -106,14 +95,22 @@
 
   public int compareTo(Object o) {
     Hit other = (Hit)o;
-    if (other.score > this.score) {               // prefer higher scores
-      return 1;
-    } else if (other.score < this.score) {
-      return -1;
+    int compare = sortValue.compareTo(other.sortValue);
+    if (compare != 0) {
+      return compare;                             // use sortValue
     } else if (other.indexNo != this.indexNo) {
       return other.indexNo - this.indexNo;        // prefer later indexes
     } else {
       return other.indexDocNo - this.indexDocNo;  // prefer later docs
     }
   }
+
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(indexDocNo);
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    indexDocNo = in.readInt();
+  }
+
 }

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java Wed May  4 12:57:20 2005
@@ -21,6 +21,8 @@
 import java.io.IOException;
 
 import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.WritableComparable;
+import org.apache.nutch.io.UTF8;
 
 import java.util.logging.Logger;
 import org.apache.nutch.util.LogFormatter;
@@ -69,19 +71,45 @@
 
 
   public void write(DataOutput out) throws IOException {
-    out.writeLong(total);
-    out.writeInt(top.length);
+    out.writeLong(total);                         // write total hits
+    out.writeInt(top.length);                     // write hits returned
+    if (top.length > 0)                           // write sort value class
+      UTF8.writeString(out, top[0].getSortValue().getClass().getName());
+                      
     for (int i = 0; i < top.length; i++) {
-      top[i].write(out);
+      Hit h = top[i];
+      out.writeInt(h.getIndexDocNo());            // write indexDocNo
+      h.getSortValue().write(out);                // write sortValue
+      UTF8.writeString(out, h.getDedupValue());   // write dedupValue
     }
   }
 
   public void readFields(DataInput in) throws IOException {
-    total = in.readLong();
-    top = new Hit[in.readInt()];
+    total = in.readLong();                        // read total hits
+    top = new Hit[in.readInt()];                  // read hits returned
+    Class sortClass = null;
+    if (top.length > 0) {                         // read sort value class
+      try {
+        sortClass = Class.forName(UTF8.readString(in));
+      } catch (ClassNotFoundException e) {
+        throw new IOException(e.toString());
+      }
+    }
+
     for (int i = 0; i < top.length; i++) {
-      top[i] = new Hit();
-      top[i].readFields(in);
+      int indexDocNo = in.readInt();              // read indexDocNo
+
+      WritableComparable sortValue = null;
+      try {
+        sortValue = (WritableComparable)sortClass.newInstance();
+      } catch (Exception e) {
+        throw new IOException(e.toString());
+      }
+      sortValue.readFields(in);                   // read sortValue
+
+      String dedupValue = UTF8.readString(in);    // read dedupValue
+
+      top[i] = new Hit(indexDocNo, sortValue, dedupValue);
     }
   }
 

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Wed May  4 12:57:20 2005
@@ -28,6 +28,7 @@
 import org.apache.lucene.search.MultiSearcher;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.FieldDoc;
 import org.apache.lucene.search.FieldCache;
 
 import org.apache.lucene.document.Document;
@@ -46,9 +47,8 @@
 public class IndexSearcher implements Searcher, HitDetailer {
 
   private org.apache.lucene.search.Searcher luceneSearcher;
+  private org.apache.lucene.index.IndexReader reader;
 
-  private String[] sites;
-  
   private LuceneQueryOptimizer optimizer = new LuceneQueryOptimizer
     (NutchConf.get().getInt("searcher.filter.cache.size", 16),
      NutchConf.get().getFloat("searcher.filter.cache.threshold", 0.05f));
@@ -70,18 +70,23 @@
   }
 
   private void init(IndexReader reader) throws IOException {
-    this.sites = FieldCache.DEFAULT.getStrings(reader, "site");
+    this.reader = reader;
     this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
     this.luceneSearcher.setSimilarity(new NutchSimilarity());
   }
 
-  public Hits search(Query query, int numHits) throws IOException {
+  public Hits search(Query query, int numHits,
+                     String dedupField, String sortField, boolean reverse)
+
+    throws IOException {
 
     org.apache.lucene.search.BooleanQuery luceneQuery =
       QueryFilters.filter(query);
     
     return translateHits
-      (optimizer.optimize(luceneQuery, luceneSearcher, numHits));
+      (optimizer.optimize(luceneQuery, luceneSearcher, numHits,
+                          sortField, reverse),
+       dedupField, sortField);
   }
 
   public String getExplanation(Query query, Hit hit) throws IOException {
@@ -113,13 +118,40 @@
     return results;
   }
 
-  private Hits translateHits(TopDocs topDocs) throws IOException {
+  private Hits translateHits(TopDocs topDocs,
+                             String dedupField, String sortField)
+    throws IOException {
+
+    String[] dedupValues = null;
+    if (dedupField != null) 
+      dedupValues = FieldCache.DEFAULT.getStrings(reader, dedupField);
+
     ScoreDoc[] scoreDocs = topDocs.scoreDocs;
     int length = scoreDocs.length;
     Hit[] hits = new Hit[length];
     for (int i = 0; i < length; i++) {
+      
       int doc = scoreDocs[i].doc;
-      hits[i] = new Hit(doc, scoreDocs[i].score, sites[doc]);
+      
+      WritableComparable sortValue;               // convert value to writable
+      if (sortField == null) {
+        sortValue = new FloatWritable(scoreDocs[i].score);
+      } else {
+        Object raw = ((FieldDoc)scoreDocs[i]).fields[0];
+        if (raw instanceof Integer) {
+          sortValue = new IntWritable(((Integer)raw).intValue());
+        } else if (raw instanceof Float) {
+          sortValue = new FloatWritable(((Float)raw).floatValue());
+        } else if (raw instanceof String) {
+          sortValue = new UTF8((String)raw);
+        } else {
+          throw new RuntimeException("Unknown sort value type!");
+        }
+      }
+
+      String dedupValue = dedupValues == null ? null : dedupValues[doc];
+
+      hits[i] = new Hit(doc, sortValue, dedupValue);
     }
     return new Hits(topDocs.totalHits, hits);
   }

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Wed May  4 12:57:20 2005
@@ -23,6 +23,7 @@
 import org.apache.lucene.search.QueryFilter;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.Sort;
 
 import java.util.LinkedHashMap;
 import java.util.Map;
@@ -54,7 +55,8 @@
   }
 
   public TopDocs optimize(BooleanQuery original,
-                          Searcher searcher, int numHits)
+                          Searcher searcher, int numHits,
+                          String sortField, boolean reverse)
     throws IOException {
 
     BooleanQuery query = new BooleanQuery();
@@ -89,6 +91,11 @@
       }        
     }
 
-    return searcher.search(query, filter, numHits);
+    if (sortField == null && !reverse) {
+      return searcher.search(query, filter, numHits);
+    } else {
+      return searcher.search(query, filter, numHits,
+                             new Sort(sortField, reverse));
+    }
   }
 }

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Wed May  4 12:57:20 2005
@@ -32,7 +32,8 @@
  * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
  */   
 public class NutchBean
-  implements Searcher, HitDetailer, HitSummarizer, HitContent {
+  implements Searcher, HitDetailer, HitSummarizer, HitContent,
+             DistributedSearch.Protocol {
 
   public static final Logger LOG =
     LogFormatter.getLogger("org.apache.nutch.searcher.NutchBean");
@@ -131,54 +132,83 @@
   }
 
   public Hits search(Query query, int numHits) throws IOException {
-    return searcher.search(query, numHits);
+    return search(query, numHits, null, null, false);
   }
   
-  private class SiteHits extends ArrayList {
+  public Hits search(Query query, int numHits,
+                     String dedupField, String sortField, boolean reverse)
+    throws IOException {
+
+    return searcher.search(query, numHits, dedupField, sortField, reverse);
+  }
+  
+  private class DupHits extends ArrayList {
     private boolean maxSizeExceeded;
   }
 
-  /**
-   * Search for pages matching a query, eliminating excessive hits from sites.
-   * Hits for a site in excess of <code>maxHitsPerSite</code> are removed from
-   * the results.  The remaining hits for such sites have {@link
-   * Hit#moreFromSiteExcluded()} set.
-   * <p>
-   * If maxHitsPerSite is zero then all hits are returned.
+  /** Search for pages matching a query, eliminating excessive hits with
+   * matching values for a named field.  Hits after the first
+   * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
+   * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
+   * then all hits are returned.
+   * 
+   * @param query query
+   * @param numHits number of requested hits
+   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+   * @param dedupField field name to check for duplicates
+   * @return Hits the matching hits
+   * @throws IOException
+   */
+  public Hits search(Query query, int numHits,
+                     int maxHitsPerDup, String dedupField)
+       throws IOException {
+    return search(query, numHits, maxHitsPerDup, dedupField, null, false);
+  }
+  /** Search for pages matching a query, ordered by <code>sortField</code>
+   * (by score when null; reversed if <code>reverse</code> is set), eliminating
+   * excessive hits with matching values for a named field.  Hits after the
+   * first <code>maxHitsPerDup</code> are removed from results.  The remaining
+   * hits have {@link Hit#moreFromDupExcluded()} set.  <p> If zero, all are kept.
    * 
    * @param query query
    * @param numHits number of requested hits
-   * @param maxHitsPerSite the maximum hits returned per site, or zero
+   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+   * @param dedupField field name to check for duplicates
    * @return Hits the matching hits
    * @throws IOException
    */
-  public Hits search(Query query, int numHits, int maxHitsPerSite)
+  public Hits search(Query query, int numHits,
+                     int maxHitsPerDup, String dedupField,
+                     String sortField, boolean reverse)
        throws IOException {
-    if (maxHitsPerSite <= 0)                      // disable site checking
-      return searcher.search(query, numHits);
+    if (maxHitsPerDup <= 0)                      // disable dup checking
+      return search(query, numHits, dedupField, sortField, reverse);
 
     int numHitsRaw = (int)(numHits * RAW_HITS_FACTOR);
     LOG.info("searching for "+numHitsRaw+" raw hits");
-    Hits hits = searcher.search(query, numHitsRaw);
+    Hits hits = searcher.search(query, numHitsRaw,
+                                dedupField, sortField, reverse);
     long total = hits.getTotal();
-    Map siteToHits = new HashMap();
+    Map dupToHits = new HashMap();
     List resultList = new ArrayList();
     Set seen = new HashSet();
-    List excludedSites = new ArrayList();
+    List excludedValues = new ArrayList();
     boolean totalIsExact = true;
     for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
       // get the next raw hit
       if (rawHitNum >= hits.getLength()) {
-        // optimize query by prohibiting more matches on some excluded sites
+        // optimize query by prohibiting more matches on some excluded values
         Query optQuery = (Query)query.clone();
-        for (int i = 0; i < excludedSites.size(); i++) {
+        for (int i = 0; i < excludedValues.size(); i++) {
           if (i == MAX_PROHIBITED_TERMS)
             break;
-          optQuery.addProhibitedTerm(((String)excludedSites.get(i)), "site");
+          optQuery.addProhibitedTerm(((String)excludedValues.get(i)),
+                                     dedupField);
         }
         numHitsRaw = (int)(numHitsRaw * RAW_HITS_FACTOR);
         LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
-        hits = searcher.search(optQuery, numHitsRaw);
+        hits = searcher.search(optQuery, numHitsRaw,
+                               dedupField, sortField, reverse);
         LOG.info("found "+hits.getTotal()+" raw hits");
         rawHitNum = -1;
         continue;
@@ -189,28 +219,28 @@
         continue;
       seen.add(hit);
       
-      // get site hits for its site
-      String site = hit.getSite();
-      SiteHits siteHits = (SiteHits)siteToHits.get(site);
-      if (siteHits == null)
-        siteToHits.put(site, siteHits = new SiteHits());
-
-      // does this hit exceed maxHitsPerSite?
-      if (siteHits.size() == maxHitsPerSite) {    // yes -- ignore the hit
-        if (!siteHits.maxSizeExceeded) {
-
-          // mark prior hits with moreFromSiteExcluded
-          for (int i = 0; i < siteHits.size(); i++) {
-            ((Hit)siteHits.get(i)).setMoreFromSiteExcluded(true);
+      // get dup hits for its value
+      String value = hit.getDedupValue();
+      DupHits dupHits = (DupHits)dupToHits.get(value);
+      if (dupHits == null)
+        dupToHits.put(value, dupHits = new DupHits());
+
+      // does this hit exceed maxHitsPerDup?
+      if (dupHits.size() == maxHitsPerDup) {      // yes -- ignore the hit
+        if (!dupHits.maxSizeExceeded) {
+
+          // mark prior hits with moreFromDupExcluded
+          for (int i = 0; i < dupHits.size(); i++) {
+            ((Hit)dupHits.get(i)).setMoreFromDupExcluded(true);
           }
-          siteHits.maxSizeExceeded = true;
+          dupHits.maxSizeExceeded = true;
 
-          excludedSites.add(site);                // exclude site
+          excludedValues.add(value);              // exclude dup
         }
         totalIsExact = false;
       } else {                                    // no -- collect the hit
         resultList.add(hit);
-        siteHits.add(hit);
+        dupHits.add(hit);
 
         // are we done?
         // we need to find one more than asked for, so that we can tell if

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Wed May  4 12:57:20 2005
@@ -98,13 +98,18 @@
     if (hitsPerSiteString != null)
       hitsPerSite = Integer.parseInt(hitsPerSiteString);
 
+    String sort = request.getParameter("sort");
+    boolean reverse =
+      sort!=null && "true".equals(request.getParameter("reverse"));
+
     Query query = Query.parse(queryString);
     bean.LOG.info("query: " + queryString);
 
     // execute the query
     Hits hits;
     try {
-      hits = bean.search(query, start + hitsPerPage, hitsPerSite);
+      hits = bean.search(query, start + hitsPerPage, hitsPerSite, "site",
+                         sort, reverse);
     } catch (IOException e) {
       bean.LOG.log(Level.WARNING, "Search Error", e);
       hits = new Hits(0,new Hit[0]);	
@@ -122,6 +127,9 @@
 
     String requestUrl = request.getRequestURL().toString();
     String base = requestUrl.substring(0, requestUrl.lastIndexOf('/'));
+    String params = "&hitsPerPage="+hitsPerPage
+      +(sort==null ? "" : "&sort="+sort+(reverse?"&reverse=true":""));
+      
 
     try {
       DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
@@ -130,7 +138,8 @@
  
       Element rss = addNode(doc, doc, "rss");
       addAttribute(doc, rss, "version", "2.0");
-      addAttribute(doc, rss, "xmlns:os", (String)NS_MAP.get("opensearch"));
+      addAttribute(doc, rss, "xmlns:opensearch",
+                   (String)NS_MAP.get("opensearch"));
       addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch"));
 
       Element channel = addNode(doc, rss, "channel");
@@ -142,8 +151,8 @@
               base+"/search.jsp"
               +"?query="+urlQuery
               +"&start="+start
-              +"&hitsPerPage="+hitsPerPage
-              +"&hitsPerSite="+hitsPerSite);
+              +"&hitsPerSite="+hitsPerSite
+              +params);
 
       addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal());
       addNode(doc, channel, "opensearch", "startIndex", ""+start);
@@ -157,15 +166,15 @@
         addNode(doc, channel, "nutch", "nextPage", requestUrl
                 +"?query="+urlQuery
                 +"&start="+end
-                +"&hitsPerPage="+hitsPerPage
-                +"&hitsPerSite="+hitsPerSite);
+                +"&hitsPerSite="+hitsPerSite
+                +params);
       }
 
       if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) {
         addNode(doc, channel, "nutch", "showAllHits", requestUrl
                 +"?query="+urlQuery
-                +"&hitsPerPage="+hitsPerPage
-                +"&hitsPerSite="+0);
+                +"&hitsPerSite="+0
+                +params);
       }
 
       for (int i = 0; i < length; i++) {
@@ -184,18 +193,19 @@
         addNode(doc, item, "description", summaries[i]);
         addNode(doc, item, "link", url);
 
-        addNode(doc, item, "nutch", "site", hit.getSite());
+        addNode(doc, item, "nutch", "site", hit.getDedupValue());
 
         addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id);
         addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id
                 +"&query="+urlQuery);
 
-        if (hit.moreFromSiteExcluded()) {
+        if (hit.moreFromDupExcluded()) {
           addNode(doc, item, "nutch", "moreFromSite", requestUrl
                   +"?query="
-                  +URLEncoder.encode("site:"+hit.getSite()+" "+queryString,
-                                     "UTF-8")
-                  +"&hitsPerPage="+hitsPerPage+"&hitsPerSite="+0);
+                  +URLEncoder.encode("site:"+hit.getDedupValue()
+                                     +" "+queryString, "UTF-8")
+                  +"&hitsPerSite="+0
+                  +params);
         }
 
         for (int j = 0; j < detail.getLength(); j++) { // add all from detail

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java Wed May  4 12:57:20 2005
@@ -21,7 +21,9 @@
 /** Service that searches. */
 public interface Searcher {
   /** Return the top-scoring hits for a query. */
-  Hits search(Query query, int numHits) throws IOException;
+  Hits search(Query query, int numHits,
+              String dedupField, String sortField, boolean reverse)
+    throws IOException;
 
   /** Return an HTML-formatted explanation of how a query scored. */
   String getExplanation(Query query, Hit hit) throws IOException;

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java Wed May  4 12:57:20 2005
@@ -69,7 +69,7 @@
     bean.LOG.info("request from " + request.getRemoteAddr());
 
     Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    Integer.parseInt(request.getParameter("id")), 0.0f, null);
+                      Integer.parseInt(request.getParameter("id")));
     HitDetails details = bean.getDetails(hit);
 
     // raw bytes

Modified: incubator/nutch/trunk/src/web/jsp/anchors.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/anchors.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/anchors.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/anchors.jsp Wed May  4 12:57:20 2005
@@ -15,7 +15,7 @@
   request.setCharacterEncoding("UTF-8");
   bean.LOG.info("anchors request from " + request.getRemoteAddr());
   Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    Integer.parseInt(request.getParameter("id")), 0.0f, null);
+                    Integer.parseInt(request.getParameter("id")));
   HitDetails details = bean.getDetails(hit);
   String language =
     ResourceBundle.getBundle("org.nutch.jsp.anchors", request.getLocale())

Modified: incubator/nutch/trunk/src/web/jsp/cached.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/cached.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/cached.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/cached.jsp Wed May  4 12:57:20 2005
@@ -11,7 +11,7 @@
   NutchBean bean = NutchBean.get(application);
   bean.LOG.info("cache request from " + request.getRemoteAddr());
   Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    Integer.parseInt(request.getParameter("id")), 0.0f, null);
+                    Integer.parseInt(request.getParameter("id")));
   HitDetails details = bean.getDetails(hit);
   String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
 

Modified: incubator/nutch/trunk/src/web/jsp/explain.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/explain.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/explain.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/explain.jsp Wed May  4 12:57:20 2005
@@ -13,7 +13,7 @@
   request.setCharacterEncoding("UTF-8");
   bean.LOG.info("explain request from " + request.getRemoteAddr());
   Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    Integer.parseInt(request.getParameter("id")), 0.0f, null);
+                    Integer.parseInt(request.getParameter("id")));
   HitDetails details = bean.getDetails(hit);
   Query query = Query.parse(request.getParameter("query"));
   String language =

Modified: incubator/nutch/trunk/src/web/jsp/search.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/search.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/search.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/search.jsp Wed May  4 12:57:20 2005
@@ -155,7 +155,7 @@
     // position.... this is good, bad?... ugly?....
    Hits hits;
    try{
-     hits = bean.search(query, start + hitsToRetrieve, hitsPerSite);
+     hits = bean.search(query, start + hitsToRetrieve, hitsPerSite, "site");
    } catch (IOException e){
      hits = new Hits(0,new Hit[0]);	
    }
@@ -214,13 +214,13 @@
     (<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>)
     (<a href="../explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString)%>"><i18n:message key="explain"/></a>)
     (<a href="../anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>)
-    <% if (hit.moreFromSiteExcluded()) {
+    <% if (hit.moreFromDupExcluded()) {
     String more =
-    "query="+URLEncoder.encode("site:"+hit.getSite()+" "+queryString)
+    "query="+URLEncoder.encode("site:"+hit.getDedupValue()+" "+queryString)
     +"&hitsPerPage="+hitsPerPage+"&hitsPerSite="+0
     +"&clustering="+clustering;%>
     (<a href="../search.jsp?<%=more%>"><i18n:message key="moreFrom"/>
-     <%=hit.getSite()%></a>)
+     <%=hit.getDedupValue()%></a>)
     <% } %>
     <br><br>
 <% } %>

Modified: incubator/nutch/trunk/src/web/jsp/text.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/text.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/text.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/text.jsp Wed May  4 12:57:20 2005
@@ -17,7 +17,7 @@
   bean.LOG.info("text request from " + request.getRemoteAddr());
 
   Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
-                    Integer.parseInt(request.getParameter("id")), 0.0f, null);
+                    Integer.parseInt(request.getParameter("id")));
   HitDetails details = bean.getDetails(hit);
 
   String text = bean.getParseText(details).getText();