You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/18 13:53:13 UTC

svn commit: r745503 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java

Author: siren
Date: Wed Feb 18 12:53:12 2009
New Revision: 745503

URL: http://svn.apache.org/viewvc?rev=745503&view=rev
Log:
NUTCH-563 Include custom fields in BasicQueryFilter, contributed by Julien Nioche

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745503&r1=745502&r2=745503&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:53:12 2009
@@ -346,6 +346,9 @@
 129. NUTCH-691 - Update jakarta poi jars to the most relevant version
      (Dmitry Lihachev via siren)
 
+130. NUTCH-563 - Include custom fields in BasicQueryFilter
+     (Julien Nioche via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=745503&r1=745502&r2=745503&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 18 12:53:12 2009
@@ -1119,6 +1119,15 @@
   </description>
 </property>
 
+<!--
+<property>
+  <name>query.basic.description.boost</name>
+  <value>1.0</value>
+  <description> Declares a custom field and its boost to be added to the default fields of the Lucene query.
+  </description>
+</property>
+-->
+
 <!-- creative-commons plugin properties -->
 
 <property>

Modified: lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?rev=745503&r1=745502&r2=745503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Wed Feb 18 12:53:12 2009
@@ -22,6 +22,13 @@
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.TermQuery;
 
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 import org.apache.nutch.analysis.NutchDocumentAnalyzer;
 import org.apache.nutch.analysis.CommonGrams;
 
@@ -31,7 +38,12 @@
 import org.apache.hadoop.conf.Configuration;
 
 /** The default query filter.  Query terms in the default query field are
- * expanded to search the url, anchor and content document fields.*/
+ * expanded to search the url, anchor and content document fields.
+ * Additional fields can be added by specifying parameters of the form : query.basic.(fieldname).boost
+ * to the configuration files (see nutch-default.xml for an example). Such fields will be used in the clauses
+ * generated by the BasicQueryFilter e.g. for a user query A B, it generates +(field1:A field2:A ...) +(field1:B field2:B....).
+ * If you don't want the additional fields to be included in the clauses you will need to implement a custom query filter for it.
+ **/
 public class BasicQueryFilter implements QueryFilter {
     
   private static final int  URL_BOOST       = 0;
@@ -44,7 +56,7 @@
 
   private float PHRASE_BOOST;
 
-  private static final String[] FIELDS =
+  private String[] FIELDS =
   { "url", "anchor", "content", "title", "host" };
 
   private float[] FIELD_BOOSTS = new float[5];
@@ -177,9 +189,51 @@
     this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f);
     this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f);
     this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
+    findAdditionalFields(conf);
   }
 
   public Configuration getConf() {
     return this.conf;
   }
+  
+  /** Searches the configuration for parameters of the form : query.basic.(fieldname).boost
+   * and appends each such fieldname, together with its boost, to the default fields and boosts.
+   **/
+  private void findAdditionalFields(Configuration conf) {
+    // collect additional fields from configuration keys matching the pattern; group 1 is the field name
+    Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost");
+    Iterator confEntriesIterator = conf.iterator(); 
+    List existingFields = java.util.Arrays.asList(FIELDS);  
+    ArrayList tempfieldNames = new ArrayList();
+    ArrayList tempfieldBoosts = new ArrayList();
+    while (confEntriesIterator.hasNext()){
+      Map.Entry entry = (Map.Entry) confEntriesIterator.next();
+      String key = entry.getKey().toString();
+      Matcher match = pat.matcher(key);
+      if (!match.matches())continue;
+      String fieldName = match.group(1);
+      if (fieldName!=null){
+        // skip names that are already present in FIELDS
+        if (existingFields.contains(fieldName)) continue;
+        // "phrase" is treated as a reserved keyword and is never added as a field
+        if (fieldName.equals("phrase")) continue;
+        float boostCustomField = conf.getFloat(key, 2.0f); // 2.0f is the default handed to getFloat
+        tempfieldNames.add(fieldName);
+        tempfieldBoosts.add(Float.valueOf(boostCustomField));
+      }
+    }
+    if (tempfieldNames.size()==0) return;
+    // build enlarged arrays: the first 5 slots are copied from the current arrays, custom fields follow
+    String[] tempNames = new String[5+tempfieldNames.size()];
+    float[] tempBoosts = new float[5+tempfieldNames.size()];
+    System.arraycopy(FIELDS, 0,tempNames, 0, 5);
+    System.arraycopy(this.FIELD_BOOSTS, 0,tempBoosts, 0, 5);
+    for (int newF=0; newF  < tempfieldNames.size();newF++){
+      tempNames[5+newF]=(String) tempfieldNames.get(newF);
+      tempBoosts[5+newF]= ((Float)tempfieldBoosts.get(newF)).floatValue();
+    }
+    // replace the instance's field names and boosts with the enlarged arrays
+    this.FIELDS = tempNames;
+    this.FIELD_BOOSTS = tempBoosts;
+  }
 }