You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/05/04 21:57:21 UTC
svn commit: r168178 - in /incubator/nutch/trunk: ./
src/java/org/apache/nutch/ipc/ src/java/org/apache/nutch/searcher/
src/java/org/apache/nutch/servlet/ src/web/jsp/
Author: cutting
Date: Wed May 4 12:57:20 2005
New Revision: 168178
URL: http://svn.apache.org/viewcvs?rev=168178&view=rev
Log:
Add result sorting & deduping by fields other than site.
Modified:
incubator/nutch/trunk/CHANGES.txt
incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java
incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
incubator/nutch/trunk/src/web/jsp/anchors.jsp
incubator/nutch/trunk/src/web/jsp/cached.jsp
incubator/nutch/trunk/src/web/jsp/explain.jsp
incubator/nutch/trunk/src/web/jsp/search.jsp
incubator/nutch/trunk/src/web/jsp/text.jsp
Modified: incubator/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/CHANGES.txt?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/CHANGES.txt (original)
+++ incubator/nutch/trunk/CHANGES.txt Wed May 4 12:57:20 2005
@@ -73,6 +73,9 @@
15. Make query boosts for host, title, anchor and phrase matches
configurable. (Piotr Kosiorowski via cutting, 20050419)
+16. Add support for sorting search results and search-time deduping by
+ fields other than site.
+
Release 0.6
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java Wed May 4 12:57:20 2005
@@ -380,14 +380,14 @@
public Writable call(Writable param) throws IOException {
try {
Invocation call = (Invocation)param;
- if (verbose) LOG.info("Call: " + call);
+ if (verbose) log("Call: " + call);
Method method =
implementation.getMethod(call.getMethodName(),
call.getParameterClasses());
Object value = method.invoke(instance, call.getParameters());
- if (verbose) LOG.info("Return: " + value);
+ if (verbose) log("Return: "+value);
return new ObjectWritable(method.getReturnType(), value);
@@ -407,6 +407,12 @@
}
}
};
+ }
+
+ private static void log(String value) {
+ if (value!= null && value.length() > 55)
+ value = value.substring(0, 55)+"...";
+ LOG.info(value);
}
}
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java Wed May 4 12:57:20 2005
@@ -125,7 +125,8 @@
GET_SEGMENTS = Protocol.class.getMethod
("getSegmentNames", new Class[] {});
SEARCH = Protocol.class.getMethod
- ("search", new Class[] { Query.class, Integer.TYPE});
+ ("search", new Class[] { Query.class, Integer.TYPE, String.class,
+ String.class, Boolean.TYPE});
DETAILS = Protocol.class.getMethod
("getDetails", new Class[] { Hit.class});
SUMMARY = Protocol.class.getMethod
@@ -179,30 +180,50 @@
segmentToAddress.keySet().toArray(new String[segmentToAddress.size()]);
}
- public Hits search(Query query, int numHits) throws IOException {
+ public Hits search(final Query query, final int numHits,
+ final String dedupField, final String sortField,
+ final boolean reverse) throws IOException {
long totalHits = 0;
Hits[] segmentHits = new Hits[liveAddresses.length];
- Object[][] params = new Object[liveAddresses.length][2];
+ Object[][] params = new Object[liveAddresses.length][5];
for (int i = 0; i < params.length; i++) {
params[i][0] = query;
params[i][1] = new Integer(numHits);
+ params[i][2] = dedupField;
+ params[i][3] = sortField;
+ params[i][4] = Boolean.valueOf(reverse);
}
Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses);
- TreeSet queue = new TreeSet(); // cull top hits from results
- Comparable minValue = null;
+ TreeSet queue; // cull top hits from results
+
+ if (sortField == null || reverse) {
+ queue = new TreeSet(new Comparator() {
+ public int compare(Object o1, Object o2) {
+ return ((Comparable)o2).compareTo(o1); // reverse natural order
+ }
+ });
+ } else {
+ queue = new TreeSet();
+ }
+
+ Comparable maxValue = null;
for (int i = 0; i < results.length; i++) {
Hits hits = results[i];
if (hits == null) continue;
totalHits += hits.getTotal();
for (int j = 0; j < hits.getLength(); j++) {
Hit h = hits.getHit(j);
- if (minValue == null || h.getSite().compareTo(minValue) >= 0) {
- queue.add(new Hit(i, h.getIndexDocNo(), h.getSite()));
+ if (maxValue == null ||
+ ((reverse || sortField == null)
+ ? h.getSortValue().compareTo(maxValue) >= 0
+ : h.getSortValue().compareTo(maxValue) <= 0)) {
+ queue.add(new Hit(i, h.getIndexDocNo(),
+ h.getSortValue(), h.getDedupValue()));
if (queue.size() > numHits) { // if hit queue overfull
queue.remove(queue.last()); // remove lowest in hit queue
- minValue = ((Hit)queue.last()).getSite(); // reset minValue
+ maxValue = ((Hit)queue.last()).getSortValue(); // reset maxValue
}
}
}
@@ -297,7 +318,7 @@
Client client = new Client(addresses);
//client.setTimeout(Integer.MAX_VALUE);
- Hits hits = client.search(query, 10);
+ Hits hits = client.search(query, 10, null, null, false);
System.out.println("Total hits: " + hits.getTotal());
for (int i = 0; i < hits.getLength(); i++) {
System.out.println(" "+i+" "+ client.getDetails(hits.getHit(i)));
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java Wed May 4 12:57:20 2005
@@ -21,6 +21,7 @@
import java.io.IOException;
import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.WritableComparable;
import org.apache.nutch.io.UTF8;
import java.util.logging.Logger;
@@ -33,25 +34,25 @@
private int indexNo; // index id
private int indexDocNo; // index-relative id
- private float score; // its score for the query
- private String site; // its website name
- private boolean moreFromSiteExcluded;
+ private WritableComparable sortValue; // value sorted on
+ private String dedupValue; // value to dedup on
+ private boolean moreFromDupExcluded;
public Hit() {}
- public Hit(int indexNo, int indexDocNo, float score, String site) {
- this(indexDocNo, score, site);
+ public Hit(int indexNo, int indexDocNo) {
+ this(indexNo, indexDocNo, null, null);
+ }
+ public Hit(int indexNo, int indexDocNo,
+ WritableComparable sortValue,
+ String dedupValue) {
+ this(indexDocNo, sortValue, dedupValue);
this.indexNo = indexNo;
}
- public Hit(int indexDocNo, float score, String site) {
+ public Hit(int indexDocNo, WritableComparable sortValue, String dedupValue) {
this.indexDocNo = indexDocNo;
- this.score = score;
- // 20041006, xing
- // The following fixes a bug that causes cached.jsp, text.jsp, etc.,
- // to fail in distributed search. "Release 0.6, note 14" in CHANGES.txt
- if (site == null)
- site = "";
- this.site = site;
+ this.sortValue = sortValue;
+ this.dedupValue = dedupValue == null ? "" : dedupValue;
}
/** Return the index number that this hit came from. */
@@ -61,31 +62,19 @@
/** Return the document number of this hit within an index. */
public int getIndexDocNo() { return indexDocNo; }
- /** Return the degree to which this document matched the query. */
- public float getScore() { return score; }
-
- /** Return the name of this this document's website. */
- public String getSite() { return site; }
+ /** Return the value of the field that hits are sorted on. */
+ public WritableComparable getSortValue() { return sortValue; }
- /** True iff other, lower-scoring, hits from the same site have been excluded
- * from the list which contains this hit.. */
- public boolean moreFromSiteExcluded() { return moreFromSiteExcluded; }
-
- /** True iff other, lower-scoring, hits from the same site have been excluded
- * from the list which contains this hit.. */
- public void setMoreFromSiteExcluded(boolean more){moreFromSiteExcluded=more;}
+ /** Return the value of the field that hits should be deduplicated on. */
+ public String getDedupValue() { return dedupValue; }
- public void write(DataOutput out) throws IOException {
- out.writeInt(indexDocNo);
- out.writeFloat(score);
- UTF8.writeString(out, site);
- }
-
- public void readFields(DataInput in) throws IOException {
- indexDocNo = in.readInt();
- score = in.readFloat();
- site = UTF8.readString(in);
- }
+ /** True iff other, lower-scoring, hits with the same dedup value have been
+ * excluded from the list which contains this hit. */
+ public boolean moreFromDupExcluded() { return moreFromDupExcluded; }
+
+ /** Sets whether other, lower-scoring, hits with the same dedup value
+ * have been excluded from the list which contains this hit. */
+ public void setMoreFromDupExcluded(boolean more){moreFromDupExcluded=more;}
/** Display as a string. */
public String toString() {
@@ -106,14 +95,22 @@
public int compareTo(Object o) {
Hit other = (Hit)o;
- if (other.score > this.score) { // prefer higher scores
- return 1;
- } else if (other.score < this.score) {
- return -1;
+ int compare = sortValue.compareTo(other.sortValue);
+ if (compare != 0) {
+ return compare; // use sortValue
} else if (other.indexNo != this.indexNo) {
return other.indexNo - this.indexNo; // prefer later indexes
} else {
return other.indexDocNo - this.indexDocNo; // prefer later docs
}
}
+
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(indexDocNo);
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ indexDocNo = in.readInt();
+ }
+
}
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java Wed May 4 12:57:20 2005
@@ -21,6 +21,8 @@
import java.io.IOException;
import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.WritableComparable;
+import org.apache.nutch.io.UTF8;
import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
@@ -69,19 +71,45 @@
public void write(DataOutput out) throws IOException {
- out.writeLong(total);
- out.writeInt(top.length);
+ out.writeLong(total); // write total hits
+ out.writeInt(top.length); // write hits returned
+ if (top.length > 0) // write sort value class
+ UTF8.writeString(out, top[0].getSortValue().getClass().getName());
+
for (int i = 0; i < top.length; i++) {
- top[i].write(out);
+ Hit h = top[i];
+ out.writeInt(h.getIndexDocNo()); // write indexDocNo
+ h.getSortValue().write(out); // write sortValue
+ UTF8.writeString(out, h.getDedupValue()); // write dedupValue
}
}
public void readFields(DataInput in) throws IOException {
- total = in.readLong();
- top = new Hit[in.readInt()];
+ total = in.readLong(); // read total hits
+ top = new Hit[in.readInt()]; // read hits returned
+ Class sortClass = null;
+ if (top.length > 0) { // read sort value class
+ try {
+ sortClass = Class.forName(UTF8.readString(in));
+ } catch (ClassNotFoundException e) {
+ throw new IOException(e.toString());
+ }
+ }
+
for (int i = 0; i < top.length; i++) {
- top[i] = new Hit();
- top[i].readFields(in);
+ int indexDocNo = in.readInt(); // read indexDocNo
+
+ WritableComparable sortValue = null;
+ try {
+ sortValue = (WritableComparable)sortClass.newInstance();
+ } catch (Exception e) {
+ throw new IOException(e.toString());
+ }
+ sortValue.readFields(in); // read sortValue
+
+ String dedupValue = UTF8.readString(in); // read dedupValue
+
+ top[i] = new Hit(indexDocNo, sortValue, dedupValue);
}
}
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Wed May 4 12:57:20 2005
@@ -28,6 +28,7 @@
import org.apache.lucene.search.MultiSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.document.Document;
@@ -46,9 +47,8 @@
public class IndexSearcher implements Searcher, HitDetailer {
private org.apache.lucene.search.Searcher luceneSearcher;
+ private org.apache.lucene.index.IndexReader reader;
- private String[] sites;
-
private LuceneQueryOptimizer optimizer = new LuceneQueryOptimizer
(NutchConf.get().getInt("searcher.filter.cache.size", 16),
NutchConf.get().getFloat("searcher.filter.cache.threshold", 0.05f));
@@ -70,18 +70,23 @@
}
private void init(IndexReader reader) throws IOException {
- this.sites = FieldCache.DEFAULT.getStrings(reader, "site");
+ this.reader = reader;
this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
this.luceneSearcher.setSimilarity(new NutchSimilarity());
}
- public Hits search(Query query, int numHits) throws IOException {
+ public Hits search(Query query, int numHits,
+ String dedupField, String sortField, boolean reverse)
+
+ throws IOException {
org.apache.lucene.search.BooleanQuery luceneQuery =
QueryFilters.filter(query);
return translateHits
- (optimizer.optimize(luceneQuery, luceneSearcher, numHits));
+ (optimizer.optimize(luceneQuery, luceneSearcher, numHits,
+ sortField, reverse),
+ dedupField, sortField);
}
public String getExplanation(Query query, Hit hit) throws IOException {
@@ -113,13 +118,40 @@
return results;
}
- private Hits translateHits(TopDocs topDocs) throws IOException {
+ private Hits translateHits(TopDocs topDocs,
+ String dedupField, String sortField)
+ throws IOException {
+
+ String[] dedupValues = null;
+ if (dedupField != null)
+ dedupValues = FieldCache.DEFAULT.getStrings(reader, dedupField);
+
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
int length = scoreDocs.length;
Hit[] hits = new Hit[length];
for (int i = 0; i < length; i++) {
+
int doc = scoreDocs[i].doc;
- hits[i] = new Hit(doc, scoreDocs[i].score, sites[doc]);
+
+ WritableComparable sortValue; // convert value to writable
+ if (sortField == null) {
+ sortValue = new FloatWritable(scoreDocs[i].score);
+ } else {
+ Object raw = ((FieldDoc)scoreDocs[i]).fields[0];
+ if (raw instanceof Integer) {
+ sortValue = new IntWritable(((Integer)raw).intValue());
+ } else if (raw instanceof Float) {
+ sortValue = new FloatWritable(((Float)raw).floatValue());
+ } else if (raw instanceof String) {
+ sortValue = new UTF8((String)raw);
+ } else {
+ throw new RuntimeException("Unknown sort value type!");
+ }
+ }
+
+ String dedupValue = dedupValues == null ? null : dedupValues[doc];
+
+ hits[i] = new Hit(doc, sortValue, dedupValue);
}
return new Hits(topDocs.totalHits, hits);
}
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Wed May 4 12:57:20 2005
@@ -23,6 +23,7 @@
import org.apache.lucene.search.QueryFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.Sort;
import java.util.LinkedHashMap;
import java.util.Map;
@@ -54,7 +55,8 @@
}
public TopDocs optimize(BooleanQuery original,
- Searcher searcher, int numHits)
+ Searcher searcher, int numHits,
+ String sortField, boolean reverse)
throws IOException {
BooleanQuery query = new BooleanQuery();
@@ -89,6 +91,11 @@
}
}
- return searcher.search(query, filter, numHits);
+ if (sortField == null && !reverse) {
+ return searcher.search(query, filter, numHits);
+ } else {
+ return searcher.search(query, filter, numHits,
+ new Sort(sortField, reverse));
+ }
}
}
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Wed May 4 12:57:20 2005
@@ -32,7 +32,8 @@
* @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
*/
public class NutchBean
- implements Searcher, HitDetailer, HitSummarizer, HitContent {
+ implements Searcher, HitDetailer, HitSummarizer, HitContent,
+ DistributedSearch.Protocol {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.searcher.NutchBean");
@@ -131,54 +132,83 @@
}
public Hits search(Query query, int numHits) throws IOException {
- return searcher.search(query, numHits);
+ return search(query, numHits, null, null, false);
}
- private class SiteHits extends ArrayList {
+ public Hits search(Query query, int numHits,
+ String dedupField, String sortField, boolean reverse)
+ throws IOException {
+
+ return searcher.search(query, numHits, dedupField, sortField, reverse);
+ }
+
+ private class DupHits extends ArrayList {
private boolean maxSizeExceeded;
}
- /**
- * Search for pages matching a query, eliminating excessive hits from sites.
- * Hits for a site in excess of <code>maxHitsPerSite</code> are removed from
- * the results. The remaining hits for such sites have {@link
- * Hit#moreFromSiteExcluded()} set.
- * <p>
- * If maxHitsPerSite is zero then all hits are returned.
+ /** Search for pages matching a query, eliminating excessive hits with
+ * matching values for a named field. Hits after the first
+ * <code>maxHitsPerDup</code> are removed from results. The remaining hits
+ * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero
+ * then all hits are returned.
+ *
+ * @param query query
+ * @param numHits number of requested hits
+ * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+ * @param dedupField field name to check for duplicates
+ * @return Hits the matching hits
+ * @throws IOException
+ */
+ public Hits search(Query query, int numHits,
+ int maxHitsPerDup, String dedupField)
+ throws IOException {
+ return search(query, numHits, maxHitsPerDup, dedupField, null, false);
+ }
+ /** Search for pages matching a query, eliminating excessive hits with
+ * matching values for a named field. Hits after the first
+ * <code>maxHitsPerDup</code> are removed from results. The remaining hits
+ * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero
+ * then all hits are returned.
*
* @param query query
* @param numHits number of requested hits
- * @param maxHitsPerSite the maximum hits returned per site, or zero
+ * @param maxHitsPerDup the maximum hits returned with matching values, or zero
+ * @param dedupField field name to check for duplicates
* @return Hits the matching hits
* @throws IOException
*/
- public Hits search(Query query, int numHits, int maxHitsPerSite)
+ public Hits search(Query query, int numHits,
+ int maxHitsPerDup, String dedupField,
+ String sortField, boolean reverse)
throws IOException {
- if (maxHitsPerSite <= 0) // disable site checking
- return searcher.search(query, numHits);
+ if (maxHitsPerDup <= 0) // disable dup checking
+ return search(query, numHits, dedupField, sortField, reverse);
int numHitsRaw = (int)(numHits * RAW_HITS_FACTOR);
LOG.info("searching for "+numHitsRaw+" raw hits");
- Hits hits = searcher.search(query, numHitsRaw);
+ Hits hits = searcher.search(query, numHitsRaw,
+ dedupField, sortField, reverse);
long total = hits.getTotal();
- Map siteToHits = new HashMap();
+ Map dupToHits = new HashMap();
List resultList = new ArrayList();
Set seen = new HashSet();
- List excludedSites = new ArrayList();
+ List excludedValues = new ArrayList();
boolean totalIsExact = true;
for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
// get the next raw hit
if (rawHitNum >= hits.getLength()) {
- // optimize query by prohibiting more matches on some excluded sites
+ // optimize query by prohibiting more matches on some excluded values
Query optQuery = (Query)query.clone();
- for (int i = 0; i < excludedSites.size(); i++) {
+ for (int i = 0; i < excludedValues.size(); i++) {
if (i == MAX_PROHIBITED_TERMS)
break;
- optQuery.addProhibitedTerm(((String)excludedSites.get(i)), "site");
+ optQuery.addProhibitedTerm(((String)excludedValues.get(i)),
+ dedupField);
}
numHitsRaw = (int)(numHitsRaw * RAW_HITS_FACTOR);
LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
- hits = searcher.search(optQuery, numHitsRaw);
+ hits = searcher.search(optQuery, numHitsRaw,
+ dedupField, sortField, reverse);
LOG.info("found "+hits.getTotal()+" raw hits");
rawHitNum = -1;
continue;
@@ -189,28 +219,28 @@
continue;
seen.add(hit);
- // get site hits for its site
- String site = hit.getSite();
- SiteHits siteHits = (SiteHits)siteToHits.get(site);
- if (siteHits == null)
- siteToHits.put(site, siteHits = new SiteHits());
-
- // does this hit exceed maxHitsPerSite?
- if (siteHits.size() == maxHitsPerSite) { // yes -- ignore the hit
- if (!siteHits.maxSizeExceeded) {
-
- // mark prior hits with moreFromSiteExcluded
- for (int i = 0; i < siteHits.size(); i++) {
- ((Hit)siteHits.get(i)).setMoreFromSiteExcluded(true);
+ // get dup hits for its value
+ String value = hit.getDedupValue();
+ DupHits dupHits = (DupHits)dupToHits.get(value);
+ if (dupHits == null)
+ dupToHits.put(value, dupHits = new DupHits());
+
+ // does this hit exceed maxHitsPerDup?
+ if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit
+ if (!dupHits.maxSizeExceeded) {
+
+ // mark prior hits with moreFromDupExcluded
+ for (int i = 0; i < dupHits.size(); i++) {
+ ((Hit)dupHits.get(i)).setMoreFromDupExcluded(true);
}
- siteHits.maxSizeExceeded = true;
+ dupHits.maxSizeExceeded = true;
- excludedSites.add(site); // exclude site
+ excludedValues.add(value); // exclude dup
}
totalIsExact = false;
} else { // no -- collect the hit
resultList.add(hit);
- siteHits.add(hit);
+ dupHits.add(hit);
// are we done?
// we need to find one more than asked for, so that we can tell if
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Wed May 4 12:57:20 2005
@@ -98,13 +98,18 @@
if (hitsPerSiteString != null)
hitsPerSite = Integer.parseInt(hitsPerSiteString);
+ String sort = request.getParameter("sort");
+ boolean reverse =
+ sort!=null && "true".equals(request.getParameter("reverse"));
+
Query query = Query.parse(queryString);
bean.LOG.info("query: " + queryString);
// execute the query
Hits hits;
try {
- hits = bean.search(query, start + hitsPerPage, hitsPerSite);
+ hits = bean.search(query, start + hitsPerPage, hitsPerSite, "site",
+ sort, reverse);
} catch (IOException e) {
bean.LOG.log(Level.WARNING, "Search Error", e);
hits = new Hits(0,new Hit[0]);
@@ -122,6 +127,9 @@
String requestUrl = request.getRequestURL().toString();
String base = requestUrl.substring(0, requestUrl.lastIndexOf('/'));
+ String params = "&hitsPerPage="+hitsPerPage
+ +(sort==null ? "" : "&sort="+sort+(reverse?"&reverse=true":""));
+
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
@@ -130,7 +138,8 @@
Element rss = addNode(doc, doc, "rss");
addAttribute(doc, rss, "version", "2.0");
- addAttribute(doc, rss, "xmlns:os", (String)NS_MAP.get("opensearch"));
+ addAttribute(doc, rss, "xmlns:opensearch",
+ (String)NS_MAP.get("opensearch"));
addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch"));
Element channel = addNode(doc, rss, "channel");
@@ -142,8 +151,8 @@
base+"/search.jsp"
+"?query="+urlQuery
+"&start="+start
- +"&hitsPerPage="+hitsPerPage
- +"&hitsPerSite="+hitsPerSite);
+ +"&hitsPerSite="+hitsPerSite
+ +params);
addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal());
addNode(doc, channel, "opensearch", "startIndex", ""+start);
@@ -157,15 +166,15 @@
addNode(doc, channel, "nutch", "nextPage", requestUrl
+"?query="+urlQuery
+"&start="+end
- +"&hitsPerPage="+hitsPerPage
- +"&hitsPerSite="+hitsPerSite);
+ +"&hitsPerSite="+hitsPerSite
+ +params);
}
if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) {
addNode(doc, channel, "nutch", "showAllHits", requestUrl
+"?query="+urlQuery
- +"&hitsPerPage="+hitsPerPage
- +"&hitsPerSite="+0);
+ +"&hitsPerSite="+0
+ +params);
}
for (int i = 0; i < length; i++) {
@@ -184,18 +193,19 @@
addNode(doc, item, "description", summaries[i]);
addNode(doc, item, "link", url);
- addNode(doc, item, "nutch", "site", hit.getSite());
+ addNode(doc, item, "nutch", "site", hit.getDedupValue());
addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id);
addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id
+"&query="+urlQuery);
- if (hit.moreFromSiteExcluded()) {
+ if (hit.moreFromDupExcluded()) {
addNode(doc, item, "nutch", "moreFromSite", requestUrl
+"?query="
- +URLEncoder.encode("site:"+hit.getSite()+" "+queryString,
- "UTF-8")
- +"&hitsPerPage="+hitsPerPage+"&hitsPerSite="+0);
+ +URLEncoder.encode("site:"+hit.getDedupValue()
+ +" "+queryString, "UTF-8")
+ +"&hitsPerSite="+0
+ +params);
}
for (int j = 0; j < detail.getLength(); j++) { // add all from detail
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java Wed May 4 12:57:20 2005
@@ -21,7 +21,9 @@
/** Service that searches. */
public interface Searcher {
/** Return the top-scoring hits for a query. */
- Hits search(Query query, int numHits) throws IOException;
+ Hits search(Query query, int numHits,
+ String dedupField, String sortField, boolean reverse)
+ throws IOException;
/** Return an HTML-formatted explanation of how a query scored. */
String getExplanation(Query query, Hit hit) throws IOException;
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java Wed May 4 12:57:20 2005
@@ -69,7 +69,7 @@
bean.LOG.info("request from " + request.getRemoteAddr());
Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
- Integer.parseInt(request.getParameter("id")), 0.0f, null);
+ Integer.parseInt(request.getParameter("id")));
HitDetails details = bean.getDetails(hit);
// raw bytes
Modified: incubator/nutch/trunk/src/web/jsp/anchors.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/anchors.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/anchors.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/anchors.jsp Wed May 4 12:57:20 2005
@@ -15,7 +15,7 @@
request.setCharacterEncoding("UTF-8");
bean.LOG.info("anchors request from " + request.getRemoteAddr());
Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
- Integer.parseInt(request.getParameter("id")), 0.0f, null);
+ Integer.parseInt(request.getParameter("id")));
HitDetails details = bean.getDetails(hit);
String language =
ResourceBundle.getBundle("org.nutch.jsp.anchors", request.getLocale())
Modified: incubator/nutch/trunk/src/web/jsp/cached.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/cached.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/cached.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/cached.jsp Wed May 4 12:57:20 2005
@@ -11,7 +11,7 @@
NutchBean bean = NutchBean.get(application);
bean.LOG.info("cache request from " + request.getRemoteAddr());
Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
- Integer.parseInt(request.getParameter("id")), 0.0f, null);
+ Integer.parseInt(request.getParameter("id")));
HitDetails details = bean.getDetails(hit);
String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
Modified: incubator/nutch/trunk/src/web/jsp/explain.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/explain.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/explain.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/explain.jsp Wed May 4 12:57:20 2005
@@ -13,7 +13,7 @@
request.setCharacterEncoding("UTF-8");
bean.LOG.info("explain request from " + request.getRemoteAddr());
Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
- Integer.parseInt(request.getParameter("id")), 0.0f, null);
+ Integer.parseInt(request.getParameter("id")));
HitDetails details = bean.getDetails(hit);
Query query = Query.parse(request.getParameter("query"));
String language =
Modified: incubator/nutch/trunk/src/web/jsp/search.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/search.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/search.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/search.jsp Wed May 4 12:57:20 2005
@@ -155,7 +155,7 @@
// position.... this is good, bad?... ugly?....
Hits hits;
try{
- hits = bean.search(query, start + hitsToRetrieve, hitsPerSite);
+ hits = bean.search(query, start + hitsToRetrieve, hitsPerSite, "site");
} catch (IOException e){
hits = new Hits(0,new Hit[0]);
}
@@ -214,13 +214,13 @@
(<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>)
(<a href="../explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString)%>"><i18n:message key="explain"/></a>)
(<a href="../anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>)
- <% if (hit.moreFromSiteExcluded()) {
+ <% if (hit.moreFromDupExcluded()) {
String more =
- "query="+URLEncoder.encode("site:"+hit.getSite()+" "+queryString)
+ "query="+URLEncoder.encode("site:"+hit.getDedupValue()+" "+queryString)
+"&hitsPerPage="+hitsPerPage+"&hitsPerSite="+0
+"&clustering="+clustering;%>
(<a href="../search.jsp?<%=more%>"><i18n:message key="moreFrom"/>
- <%=hit.getSite()%></a>)
+ <%=hit.getDedupValue()%></a>)
<% } %>
<br><br>
<% } %>
Modified: incubator/nutch/trunk/src/web/jsp/text.jsp
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/web/jsp/text.jsp?rev=168178&r1=168177&r2=168178&view=diff
==============================================================================
--- incubator/nutch/trunk/src/web/jsp/text.jsp (original)
+++ incubator/nutch/trunk/src/web/jsp/text.jsp Wed May 4 12:57:20 2005
@@ -17,7 +17,7 @@
bean.LOG.info("text request from " + request.getRemoteAddr());
Hit hit = new Hit(Integer.parseInt(request.getParameter("idx")),
- Integer.parseInt(request.getParameter("id")), 0.0f, null);
+ Integer.parseInt(request.getParameter("id")));
HitDetails details = bean.getDetails(hit);
String text = bean.getParseText(details).getText();