You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/18 13:53:13 UTC
svn commit: r745503 - in /lucene/nutch/trunk: CHANGES.txt
conf/nutch-default.xml
src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
Author: siren
Date: Wed Feb 18 12:53:12 2009
New Revision: 745503
URL: http://svn.apache.org/viewvc?rev=745503&view=rev
Log:
NUTCH-563 Include custom fields in BasicQueryFilter, contributed by Julien Nioche
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745503&r1=745502&r2=745503&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:53:12 2009
@@ -346,6 +346,9 @@
129. NUTCH-691 - Update jakarta poi jars to the most relevant version
(Dmitry Lihachev via siren)
+130. NUTCH-563 - Include custom fields in BasicQueryFilter
+ (Julien Nioche via siren)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=745503&r1=745502&r2=745503&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 18 12:53:12 2009
@@ -1119,6 +1119,15 @@
</description>
</property>
+<!--
+<property>
+ <name>query.basic.description.boost</name>
+ <value>1.0</value>
+ <description> Declares a custom field and its boost to be added to the default fields of the Lucene query.
+ </description>
+</property>
+-->
+
<!-- creative-commons plugin properties -->
<property>
Modified: lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?rev=745503&r1=745502&r2=745503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Wed Feb 18 12:53:12 2009
@@ -22,6 +22,13 @@
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.analysis.CommonGrams;
@@ -31,7 +38,12 @@
import org.apache.hadoop.conf.Configuration;
/** The default query filter. Query terms in the default query field are
- * expanded to search the url, anchor and content document fields.*/
+ * expanded to search the url, anchor and content document fields.
+ * Additional fields can be added by specifying parameters of the form query.basic.(fieldname).boost
+ * in the configuration files (see nutch-default.xml for an example). Such fields will be used in the clauses
+ * generated by the BasicQueryFilter, e.g. for a user query A B, it generates +(field1:A field2:A ...) +(field1:B field2:B ...).
+ * If you don't want the additional fields to be included in the clauses, you will need to implement a custom query filter instead.
+ **/
public class BasicQueryFilter implements QueryFilter {
private static final int URL_BOOST = 0;
@@ -44,7 +56,7 @@
private float PHRASE_BOOST;
- private static final String[] FIELDS =
+ private String[] FIELDS =
{ "url", "anchor", "content", "title", "host" };
private float[] FIELD_BOOSTS = new float[5];
@@ -177,9 +189,51 @@
this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f);
this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f);
this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
+ findAdditionalFields(conf);
}
public Configuration getConf() {
return this.conf;
}
+
+ /** Scans the configuration for keys of the form query.basic.(fieldname).boost
+ * and appends each such field name, with its boost, to the FIELDS and FIELD_BOOSTS arrays.
+ **/
+ private void findAdditionalFields(Configuration conf) {
+ // collect the additional fields declared through configuration parameters
+ Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost"); // group 1 captures the field name
+ Iterator confEntriesIterator = conf.iterator();
+ List existingFields = java.util.Arrays.asList(FIELDS);
+ ArrayList tempfieldNames = new ArrayList();
+ ArrayList tempfieldBoosts = new ArrayList();
+ while (confEntriesIterator.hasNext()){
+ Map.Entry entry = (Map.Entry) confEntriesIterator.next();
+ String key = entry.getKey().toString();
+ Matcher match = pat.matcher(key);
+ if (!match.matches())continue; // key is not a custom-field boost parameter
+ String fieldName = match.group(1);
+ if (fieldName!=null){
+ // skip names that are already present in the default FIELDS
+ if (existingFields.contains(fieldName)) continue;
+ // skip the reserved keyword phrase
+ if (fieldName.equals("phrase")) continue;
+ float boostCustomField = conf.getFloat(key, 2.0f); // 2.0f is the default value handed to getFloat
+ tempfieldNames.add(fieldName);
+ tempfieldBoosts.add(Float.valueOf(boostCustomField));
+ }
+ }
+ if (tempfieldNames.size()==0) return; // no custom fields found: leave FIELDS and FIELD_BOOSTS untouched
+ // build enlarged arrays: the 5 default entries first, followed by the custom ones
+ String[] tempNames = new String[5+tempfieldNames.size()];
+ float[] tempBoosts = new float[5+tempfieldNames.size()];
+ System.arraycopy(FIELDS, 0,tempNames, 0, 5); // 5 is the number of default fields
+ System.arraycopy(this.FIELD_BOOSTS, 0,tempBoosts, 0, 5);
+ for (int newF=0; newF < tempfieldNames.size();newF++){
+ tempNames[5+newF]=(String) tempfieldNames.get(newF);
+ tempBoosts[5+newF]= ((Float)tempfieldBoosts.get(newF)).floatValue();
+ }
+ // swap the enlarged arrays in for the original ones
+ this.FIELDS = tempNames;
+ this.FIELD_BOOSTS = tempBoosts;
+ }
}