You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/05/05 23:40:08 UTC
svn commit: r168427 - in /incubator/nutch/trunk: CHANGES.txt
lib/lucene-1.4.2.jar lib/lucene-1.9-rc1-dev.jar
lib/lucene-misc-1.9-rc1-dev.jar
src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
Author: cutting
Date: Thu May 5 14:40:07 2005
New Revision: 168427
URL: http://svn.apache.org/viewcvs?rev=168427&view=rev
Log:
Automatically convert range queries to range filters. Requires latest Lucene.
Added:
incubator/nutch/trunk/lib/lucene-1.9-rc1-dev.jar (with props)
incubator/nutch/trunk/lib/lucene-misc-1.9-rc1-dev.jar (with props)
Removed:
incubator/nutch/trunk/lib/lucene-1.4.2.jar
Modified:
incubator/nutch/trunk/CHANGES.txt
incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
Modified: incubator/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/CHANGES.txt?rev=168427&r1=168426&r2=168427&view=diff
==============================================================================
--- incubator/nutch/trunk/CHANGES.txt (original)
+++ incubator/nutch/trunk/CHANGES.txt Thu May 5 14:40:07 2005
@@ -76,6 +76,10 @@
16. Add support for sorting search results and search-time deduping by
fields other than site.
+17. Automatically convert range queries into cached range filters.
+ This improves the performance and scalability of, e.g., date range
+ searching.
+
Release 0.6
Added: incubator/nutch/trunk/lib/lucene-1.9-rc1-dev.jar
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/lib/lucene-1.9-rc1-dev.jar?rev=168427&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/nutch/trunk/lib/lucene-1.9-rc1-dev.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/nutch/trunk/lib/lucene-misc-1.9-rc1-dev.jar
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/lib/lucene-misc-1.9-rc1-dev.jar?rev=168427&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/nutch/trunk/lib/lucene-misc-1.9-rc1-dev.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=168427&r1=168426&r2=168427&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Thu May 5 14:40:07 2005
@@ -17,33 +17,31 @@
package org.apache.nutch.searcher;
import org.apache.lucene.search.Searcher;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.QueryFilter;
-import org.apache.lucene.search.Filter;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.*;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.misc.ChainedFilter;
import java.util.LinkedHashMap;
import java.util.Map;
+import java.util.ArrayList;
+
import java.io.IOException;
/** Utility which converts certain query clauses into {@link QueryFilter}s and
- * caches these. Only required {@link TermQuery}s whose boost is zero and
- * whose term occurs in at least a certain fraction of documents are converted
- * to cached filters. This accellerates query constraints like language,
- * document format, etc., which do not affect ranking but might otherwise slow
- * search considerably. */
+ * caches these. Only required clauses whose boost is zero are converted to
+ * cached filters. Range queries are converted to range filters. This
+ * accellerates query constraints like date, language, document format, etc.,
+ * which do not affect ranking but might otherwise slow search considerably. */
class LuceneQueryOptimizer {
private LinkedHashMap cache; // an LRU cache of QueryFilter
private float threshold;
- /** Construct an optimizer that caches and uses filters for required {@link
- * TermQuery}s whose boost is zero.
+ /** Construct an optimizer that caches and uses filters for required clauses
+ * whose boost is zero.
* @param cacheSize the number of QueryFilters to cache
- * @param threshold the fraction of documents which must contain term
+ * @param threshold the fraction of documents which must contain a term
*/
public LuceneQueryOptimizer(final int cacheSize, float threshold) {
this.cache = new LinkedHashMap(cacheSize, 0.75f, true) {
@@ -60,33 +58,68 @@
throws IOException {
BooleanQuery query = new BooleanQuery();
- BooleanQuery filterQuery = null;
+ BooleanQuery cacheQuery = new BooleanQuery();
+ BooleanQuery filterQuery = new BooleanQuery();
+ ArrayList filters = new ArrayList();
BooleanClause[] clauses = original.getClauses();
for (int i = 0; i < clauses.length; i++) {
BooleanClause c = clauses[i];
if (c.required // required
- && c.query.getBoost() == 0.0f // boost is zero
- && c.query instanceof TermQuery // TermQuery
- && (searcher.docFreq(((TermQuery)c.query).getTerm())
- / (float)searcher.maxDoc()) >= threshold) { // check threshold
- if (filterQuery == null)
- filterQuery = new BooleanQuery();
+ && c.query.getBoost() == 0.0f) { // boost is zero
+
+ if (c.query instanceof TermQuery // TermQuery
+ && (searcher.docFreq(((TermQuery)c.query).getTerm())
+ / (float)searcher.maxDoc()) < threshold) { // beneath threshold
+ query.add(c); // don't filterize
+ continue;
+ }
+
+ if (c.query instanceof RangeQuery) { // RangeQuery
+ RangeQuery range = (RangeQuery)c.query;
+ boolean inclusive = range.isInclusive();// convert to RangeFilter
+ Term lower = range.getLowerTerm();
+ Term upper = range.getUpperTerm();
+ filters.add(new RangeFilter(lower!=null?lower.field():upper.field(),
+ lower != null ? lower.text() : null,
+ upper != null ? upper.text() : null,
+ inclusive, inclusive));
+ cacheQuery.add(c.query, true, false); // cache it
+ continue;
+ }
+
+ // all other query types
filterQuery.add(c.query, true, false); // filter it
- } else {
- query.add(c); // query it
+ cacheQuery.add(c.query, true, false); // cache it
+ continue;
}
+
+ query.add(c); // query it
}
Filter filter = null;
- if (filterQuery != null) {
+
+ if (cacheQuery.getClauses().length != 0) {
synchronized (cache) { // check cache
- filter = (Filter)cache.get(filterQuery);
+ filter = (Filter)cache.get(cacheQuery);
}
if (filter == null) { // miss
- filter = new QueryFilter(filterQuery); // construct new entry
+
+ if (filterQuery.getClauses().length != 0) // add filterQuery to filters
+ filters.add(new QueryFilter(filterQuery));
+
+ if (filters.size() == 1) { // convert filters to filter
+ filter = (Filter)filters.get(0);
+ } else {
+ filter = new ChainedFilter((Filter[])filters.toArray
+ (new Filter[filters.size()]),
+ ChainedFilter.AND);
+ }
+ if (!(filter instanceof QueryFilter)) // make sure bits are cached
+ filter = new CachingWrapperFilter(filter);
+
synchronized (cache) {
- cache.put(filterQuery, filter); // cache it
+ cache.put(cacheQuery, filter); // cache the filter
}
}
}