You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2008/12/04 22:29:22 UTC

svn commit: r723447 [2/2] - in /lucene/nutch/trunk/src: java/org/apache/nutch/indexer/field/ plugin/ plugin/field-basic/ plugin/field-basic/src/ plugin/field-basic/src/java/ plugin/field-basic/src/java/org/ plugin/field-basic/src/java/org/apache/ plugi...

Added: lucene/nutch/trunk/src/plugin/field-basic/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/build.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-basic/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/field-basic/build.xml Thu Dec  4 13:29:21 2008
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="field-basic" default="jar-core">
+  <!-- This file defines no targets of its own; the default jar-core target comes from the imported plugin build file. -->
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: lucene/nutch/trunk/src/plugin/field-basic/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/plugin.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-basic/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/field-basic/plugin.xml Thu Dec  4 13:29:21 2008
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="field-basic"
+   name="Basic Field Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+   <!-- Jar shipped by this plugin; the export pattern "*" covers everything in it. -->
+   <runtime>
+      <library name="field-basic.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   <!-- Depends on the nutch-extensionpoints plugin. -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+   <!-- Registers BasicFieldFilter as an implementation of the FieldFilter extension point. -->
+   <extension id="org.apache.nutch.indexer.field.basic"
+              name="Nutch Basic Field Filter"
+              point="org.apache.nutch.indexer.field.FieldFilter">
+      <implementation id="BasicFieldFilter"
+        class="org.apache.nutch.indexer.field.basic.BasicFieldFilter"/>
+   </extension>
+
+</plugin>

Added: lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java Thu Dec  4 13:29:21 2008
@@ -0,0 +1,89 @@
+package org.apache.nutch.indexer.field.basic;
+
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.field.FieldFilter;
+import org.apache.nutch.indexer.field.FieldType;
+import org.apache.nutch.indexer.field.FieldWritable;
+
+/**
+ * Adds any field of type content to the index.
+ */
+public class BasicFieldFilter
+  implements FieldFilter {
+
+  public static final Log LOG = LogFactory.getLog(BasicFieldFilter.class);
+  private Configuration conf;
+  private boolean supplemental = false;
+  private String[] suppFields = null;
+
+  private boolean isSupplementalField(String name) {
+    for (int i = 0; i < suppFields.length; i++) {
+      if (name != null && name.equals(suppFields[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.supplemental = conf.getBoolean("index.supplemental", false);
+    String suppStr = conf.get("index.supplemental.fields", null);
+    if (suppStr != null) {
+      suppFields = suppStr.split(",");
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public Document filter(String url, Document doc, List<FieldWritable> fields)
+    throws IndexingException {
+
+    // loop through all of the fields
+    for (FieldWritable field : fields) {
+
+      // only grab content fields
+      FieldType type = field.getType();
+      if (type == FieldType.CONTENT) {
+
+        String fieldName = field.getName();
+        
+        // supplemental index will only index certain key fields, allow creating
+        // both a regular and a supplemental index
+        if (!supplemental || (supplemental && isSupplementalField(fieldName))) {
+
+          // create lucene fields from the FieldWritable objects
+          Field.Store store = field.isStored() ? Field.Store.YES
+            : Field.Store.NO;
+          Field.Index indexed = field.isIndexed() ? field.isTokenized()
+            ? Field.Index.TOKENIZED : Field.Index.UN_TOKENIZED : Field.Index.NO;
+          Field docField = new Field(fieldName, field.getValue(), store,
+            indexed);
+
+          // if any field boost then set it
+          float fieldBoost = field.getBoost();
+          if (fieldBoost > 0) {
+            docField.setBoost(fieldBoost);
+          }
+
+          // add the field to the lucene document
+          doc.add(docField);
+        }
+        else {
+          LOG.info("Ignoring " + fieldName + " field for " + url + " supplemental index");
+        }
+      }
+    }
+
+    return doc;
+  }
+}

Added: lucene/nutch/trunk/src/plugin/field-boost/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-boost/build.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-boost/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/field-boost/build.xml Thu Dec  4 13:29:21 2008
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="field-boost" default="jar-core">
+  <!-- Imports the shared plugin build file and wires the lib-arity plugin in as a build and test dependency. -->
+  <import file="../build-plugin.xml"/>
+  <!-- NOTE(review): the BoostFieldFilter source added in this commit imports nothing from lib-arity; confirm the dependency is needed. -->
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-arity"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-arity/*.jar" />
+    </fileset>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-arity"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+  </target>
+
+</project>

Added: lucene/nutch/trunk/src/plugin/field-boost/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-boost/plugin.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-boost/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/field-boost/plugin.xml Thu Dec  4 13:29:21 2008
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="field-boost"
+   name="Boost Field Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+   <!-- Jar shipped by this plugin; the export pattern "*" covers everything in it. -->
+   <runtime>
+      <library name="field-boost.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   <!-- Depends on the nutch-extensionpoints plugin. -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+   <!-- Registers BoostFieldFilter as an implementation of the FieldFilter extension point. -->
+   <extension id="org.apache.nutch.indexer.field.boost"
+              name="Nutch Boost Field Filter"
+              point="org.apache.nutch.indexer.field.FieldFilter">
+      <implementation id="BoostFieldFilter"
+        class="org.apache.nutch.indexer.field.boost.BoostFieldFilter"/>
+   </extension>
+
+</plugin>

Added: lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java Thu Dec  4 13:29:21 2008
@@ -0,0 +1,91 @@
+package org.apache.nutch.indexer.field.boost;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.field.FieldFilter;
+import org.apache.nutch.indexer.field.FieldType;
+import org.apache.nutch.indexer.field.FieldWritable;
+import org.apache.nutch.indexer.field.Fields;
+
+/**
+ * A field filter that aggregates fields of type {@link FieldType#BOOST} into
+ * a global score for a single Lucene document in the index.  An example of a
+ * Boost field would be the LinkRank score.
+ *
+ * Each Boost field is also stored, unindexed, under
+ * {@link Fields#BOOSTFACTOR}, and the summed boost is stored, unindexed, under
+ * {@link Fields#BOOST}.  Fields of every other type, including
+ * {@link FieldType#COMPUTATION}, are ignored by this filter.
+ */
+public class BoostFieldFilter
+  implements FieldFilter {
+
+  public static final Log LOG = LogFactory.getLog(BoostFieldFilter.class);
+  private Configuration conf;
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Sums the boosts of all Boost fields and applies the sum to the document.
+   *
+   * @param url The url of the document, not used by this filter.
+   * @param doc The Lucene document to boost and add fields to.
+   * @param fields The fields collected for the document.
+   * @return The same document instance that was passed in.
+   */
+  public Document filter(String url, Document doc, List<FieldWritable> fields)
+    throws IndexingException {
+
+    float boost = 0.0f;
+
+    for (FieldWritable field : fields) {
+
+      // save the boost factor as unindexed fields, to show different scoring
+      if (field.getType() == FieldType.BOOST) {
+        float fieldBoost = field.getBoost();
+        boost += fieldBoost;
+        doc.add(new Field(Fields.BOOSTFACTOR, field.getValue() + ": "
+          + fieldBoost, Field.Store.YES, Field.Index.NO));
+      }
+    }
+
+    // set the boost for the document and save it in the index
+    // NOTE(review): with no Boost fields the document boost becomes 0.0 --
+    // confirm that every document is expected to carry at least one
+    doc.setBoost(boost);
+    doc.add(new Field(Fields.BOOST, Float.toString(boost), Field.Store.YES,
+      Field.Index.NO));
+
+    return doc;
+  }
+}

Modified: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=723447&r1=723446&r2=723447&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Thu Dec  4 13:29:21 2008
@@ -29,6 +29,10 @@
       name="Nutch Online Search Results Clustering Plugin"/>
 
 <extension-point
+      id="org.apache.nutch.indexer.field.FieldFilter"
+      name="Nutch Field Filter"/>
+      
+<extension-point
       id="org.apache.nutch.indexer.IndexingFilter"
       name="Nutch Indexing Filter"/>
 

Added: lucene/nutch/trunk/src/plugin/query-custom/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-custom/build.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-custom/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/query-custom/build.xml Thu Dec  4 13:29:21 2008
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="query-custom" default="jar-core">
+  <!-- This file defines no targets of its own; the default jar-core target comes from the imported plugin build file. -->
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: lucene/nutch/trunk/src/plugin/query-custom/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-custom/plugin.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-custom/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/query-custom/plugin.xml Thu Dec  4 13:29:21 2008
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="query-custom"
+   name="Custom Query Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+   <!-- Jar shipped by this plugin; the export pattern "*" covers everything in it. -->
+   <runtime>
+      <library name="query-custom.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   <!-- Depends on the nutch-extensionpoints plugin. -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+   <!-- Registers CustomFieldQueryFilter as an implementation of the QueryFilter extension point. -->
+   <extension id="org.apache.nutch.searcher.custom"
+              name="Nutch Custom Field Query Filter"
+              point="org.apache.nutch.searcher.QueryFilter">
+      <implementation id="CustomQueryFilter"
+        class="org.apache.nutch.searcher.custom.CustomFieldQueryFilter">
+        <parameter name="fields" value="lang" />
+      </implementation>
+      <!-- NOTE(review): "fields" presumably lists the query field names routed to this filter; confirm against the QueryFilter extension point. -->
+   </extension>
+
+</plugin>

Added: lucene/nutch/trunk/src/plugin/query-custom/src/java/org/apache/nutch/searcher/custom/CustomFieldQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-custom/src/java/org/apache/nutch/searcher/custom/CustomFieldQueryFilter.java?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-custom/src/java/org/apache/nutch/searcher/custom/CustomFieldQueryFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/query-custom/src/java/org/apache/nutch/searcher/custom/CustomFieldQueryFilter.java Thu Dec  4 13:29:21 2008
@@ -0,0 +1,125 @@
+package org.apache.nutch.searcher.custom;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.nutch.indexer.field.CustomFields;
+import org.apache.nutch.searcher.Query;
+import org.apache.nutch.searcher.QueryException;
+import org.apache.nutch.searcher.QueryFilter;
+import org.apache.nutch.searcher.Query.Clause;
+
+public class CustomFieldQueryFilter
+  implements QueryFilter {
+
+  public static final Log LOG = LogFactory.getLog(CustomFields.class);
+  private Configuration conf;
+  private List<String> fieldNames = new ArrayList<String>();
+  private Map<String, Float> boosts = new HashMap<String, Float>();
+
+  public CustomFieldQueryFilter() {
+  }
+
+  public void setConf(Configuration conf) {
+
+    try {
+      this.conf = conf;
+      FileSystem fs = FileSystem.get(conf);
+      String configFile = conf.get("custom.fields.config", "custom-fields.xml");
+      LOG.info("Reading configuration field configuration from " + configFile);
+      Properties customFieldProps = new Properties();
+      InputStream fis = CustomFields.class.getClassLoader().getResourceAsStream(
+        configFile);
+      if (fis == null) {
+        throw new IOException("Was unable to open " + configFile);
+      }
+      customFieldProps.loadFromXML(fis);
+      Enumeration keys = customFieldProps.keys();
+      while (keys.hasMoreElements()) {
+        String prop = (String)keys.nextElement();
+        if (prop.endsWith(".name")) {
+          String propName = prop.substring(0, prop.length() - 5);
+          String name = customFieldProps.getProperty(prop);
+          fieldNames.add(name);
+          String boostKey = propName + ".boost";
+          if (customFieldProps.containsKey(boostKey)) {
+            float boost = Float.parseFloat(customFieldProps.getProperty(boostKey));
+            boosts.put(name, boost);
+          }
+        }
+      }
+    }
+    catch (Exception e) {
+      LOG.error("Error loading custom field properties:\n"
+        + StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public BooleanQuery filter(Query input, BooleanQuery output)
+    throws QueryException {
+
+    // examine each clause in the Nutch query
+    Clause[] clauses = input.getClauses();
+    for (int i = 0; i < clauses.length; i++) {
+      Clause c = clauses[i];
+
+      // skip non-matching clauses
+      String fieldName = c.getField();
+      if (!fieldNames.contains(fieldName)) {
+        continue;
+      }
+      String value = c.getTerm().toString().toLowerCase();
+
+      // add a Lucene TermQuery for this clause
+      TermQuery clause = new TermQuery(new Term(fieldName, value));
+      // set boost
+      if (boosts.containsKey(fieldName)) {
+        clause.setBoost(boosts.get(fieldName));
+      }
+
+      // add it as specified in query
+      output.add(clause, (c.isProhibited() ? BooleanClause.Occur.MUST_NOT
+        : (c.isRequired() ? BooleanClause.Occur.MUST
+          : BooleanClause.Occur.SHOULD)));
+    }
+
+    // return the modified Lucene query
+    return output;
+  }
+}