You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2008/12/04 22:29:22 UTC
svn commit: r723447 [2/2] - in /lucene/nutch/trunk/src:
java/org/apache/nutch/indexer/field/ plugin/ plugin/field-basic/
plugin/field-basic/src/ plugin/field-basic/src/java/
plugin/field-basic/src/java/org/ plugin/field-basic/src/java/org/apache/
plugi...
Added: lucene/nutch/trunk/src/plugin/field-basic/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/build.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-basic/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/field-basic/build.xml Thu Dec 4 13:29:21 2008
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="field-basic" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Added: lucene/nutch/trunk/src/plugin/field-basic/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/plugin.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-basic/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/field-basic/plugin.xml Thu Dec 4 13:29:21 2008
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="field-basic"
+ name="Basic Field Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="field-basic.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.field.basic"
+ name="Nutch Basic Field Filter"
+ point="org.apache.nutch.indexer.field.FieldFilter">
+ <implementation id="BasicFieldFilter"
+ class="org.apache.nutch.indexer.field.basic.BasicFieldFilter"/>
+ </extension>
+
+</plugin>
Added: lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/field-basic/src/java/org/apache/nutch/indexer/field/basic/BasicFieldFilter.java Thu Dec 4 13:29:21 2008
@@ -0,0 +1,122 @@
+package org.apache.nutch.indexer.field.basic;
+
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.field.FieldFilter;
+import org.apache.nutch.indexer.field.FieldType;
+import org.apache.nutch.indexer.field.FieldWritable;
+
+/**
+ * Adds any field of type content to the index.
+ *
+ * When <code>index.supplemental</code> is true, only the content fields whose
+ * names are listed in the comma separated
+ * <code>index.supplemental.fields</code> property are added, all other content
+ * fields are skipped.
+ */
+public class BasicFieldFilter
+  implements FieldFilter {
+
+  public static final Log LOG = LogFactory.getLog(BasicFieldFilter.class);
+  private static final String[] NO_FIELDS = new String[0];
+
+  private Configuration conf;
+  private boolean supplemental = false;
+  private String[] suppFields = NO_FIELDS;
+
+  /**
+   * Returns true if the name is one of the configured supplemental fields.
+   */
+  private boolean isSupplementalField(String name) {
+    if (name == null) {
+      return false;
+    }
+    for (int i = 0; i < suppFields.length; i++) {
+      if (name.equals(suppFields[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Stores the configuration and reads the supplemental index settings.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.supplemental = conf.getBoolean("index.supplemental", false);
+    String suppStr = conf.get("index.supplemental.fields", null);
+    if (suppStr == null) {
+      // keep the array non-null so isSupplementalField never dereferences null
+      suppFields = NO_FIELDS;
+    }
+    else {
+      // tolerate whitespace around the commas, e.g. "title, anchor"
+      String[] names = suppStr.split(",");
+      for (int i = 0; i < names.length; i++) {
+        names[i] = names[i].trim();
+      }
+      suppFields = names;
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Adds a Lucene field to the document for every content field in the list.
+   *
+   * @param url The url of the document, used only for logging.
+   * @param doc The Lucene document to add the fields to.
+   * @param fields The fields to examine.
+   *
+   * @return The document that was passed in.
+   */
+  public Document filter(String url, Document doc, List<FieldWritable> fields)
+    throws IndexingException {
+
+    // loop through all of the fields
+    for (FieldWritable field : fields) {
+
+      // only grab content fields
+      if (field.getType() != FieldType.CONTENT) {
+        continue;
+      }
+      String fieldName = field.getName();
+
+      // supplemental index will only index certain key fields, allow creating
+      // both a regular and a supplemental index
+      if (supplemental && !isSupplementalField(fieldName)) {
+        LOG.info("Ignoring " + fieldName + " field for " + url + " supplemental index");
+        continue;
+      }
+
+      // create lucene fields from the FieldWritable objects
+      Field.Store store = field.isStored() ? Field.Store.YES : Field.Store.NO;
+      Field.Index indexed = Field.Index.NO;
+      if (field.isIndexed()) {
+        indexed = field.isTokenized() ? Field.Index.TOKENIZED
+          : Field.Index.UN_TOKENIZED;
+      }
+      Field docField = new Field(fieldName, field.getValue(), store, indexed);
+
+      // if any field boost then set it
+      float fieldBoost = field.getBoost();
+      if (fieldBoost > 0) {
+        docField.setBoost(fieldBoost);
+      }
+
+      // add the field to the lucene document
+      doc.add(docField);
+    }
+
+    return doc;
+  }
+}
Added: lucene/nutch/trunk/src/plugin/field-boost/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-boost/build.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-boost/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/field-boost/build.xml Thu Dec 4 13:29:21 2008
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="field-boost" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-arity"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-arity/*.jar" />
+ </fileset>
+ </path>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../lib-arity"/>
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ </target>
+
+</project>
Added: lucene/nutch/trunk/src/plugin/field-boost/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-boost/plugin.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-boost/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/field-boost/plugin.xml Thu Dec 4 13:29:21 2008
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="field-boost"
+ name="Boost Field Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="field-boost.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.field.boost"
+ name="Nutch Boost Field Filter"
+ point="org.apache.nutch.indexer.field.FieldFilter">
+ <implementation id="BoostFieldFilter"
+ class="org.apache.nutch.indexer.field.boost.BoostFieldFilter"/>
+ </extension>
+
+</plugin>
Added: lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/field-boost/src/java/org/apache/nutch/indexer/field/boost/BoostFieldFilter.java Thu Dec 4 13:29:21 2008
@@ -0,0 +1,76 @@
+package org.apache.nutch.indexer.field.boost;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.field.FieldFilter;
+import org.apache.nutch.indexer.field.FieldType;
+import org.apache.nutch.indexer.field.FieldWritable;
+import org.apache.nutch.indexer.field.Fields;
+
+/**
+ * A field filter that handles fields of type Boost.
+ *
+ * Boost fields are aggregated together to create a global score for a single
+ * Lucene document in the index. An example of a Boost field would be the
+ * LinkRank score.
+ *
+ * Fields of any other type, including Computation fields, are ignored.
+ */
+public class BoostFieldFilter
+  implements FieldFilter {
+
+  public static final Log LOG = LogFactory.getLog(BoostFieldFilter.class);
+  private Configuration conf;
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Sums the boosts of all Boost fields, sets the sum as the document boost
+   * and stores each boost factor and the total as unindexed fields.
+   *
+   * @param url The url of the document, not used by this filter.
+   * @param doc The Lucene document to update.
+   * @param fields The fields to examine.
+   *
+   * @return The document that was passed in.
+   */
+  public Document filter(String url, Document doc, List<FieldWritable> fields)
+    throws IndexingException {
+
+    float boost = 0.0f;
+
+    for (FieldWritable field : fields) {
+
+      // save the boost factor as unindexed fields, to show different scoring
+      if (field.getType() == FieldType.BOOST) {
+        float fieldBoost = field.getBoost();
+        boost += fieldBoost;
+        doc.add(new Field(Fields.BOOSTFACTOR, field.getValue() + ": "
+          + fieldBoost, Field.Store.YES, Field.Index.NO));
+      }
+    }
+
+    // set the boost for the document and save it in the index
+    // NOTE(review): without any Boost field the document boost becomes 0.0,
+    // confirm that this is intended
+    doc.setBoost(boost);
+    doc.add(new Field(Fields.BOOST, Float.toString(boost), Field.Store.YES,
+      Field.Index.NO));
+
+    return doc;
+  }
+
+}
Modified: lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml?rev=723447&r1=723446&r2=723447&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/nutch-extensionpoints/plugin.xml Thu Dec 4 13:29:21 2008
@@ -29,6 +29,10 @@
name="Nutch Online Search Results Clustering Plugin"/>
<extension-point
+ id="org.apache.nutch.indexer.field.FieldFilter"
+ name="Nutch Field Filter"/>
+
+<extension-point
id="org.apache.nutch.indexer.IndexingFilter"
name="Nutch Indexing Filter"/>
Added: lucene/nutch/trunk/src/plugin/query-custom/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-custom/build.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-custom/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/query-custom/build.xml Thu Dec 4 13:29:21 2008
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="query-custom" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Added: lucene/nutch/trunk/src/plugin/query-custom/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-custom/plugin.xml?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-custom/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/query-custom/plugin.xml Thu Dec 4 13:29:21 2008
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="query-custom"
+ name="Custom Query Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="query-custom.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.searcher.custom"
+ name="Nutch Custom Field Query Filter"
+ point="org.apache.nutch.searcher.QueryFilter">
+ <implementation id="CustomQueryFilter"
+ class="org.apache.nutch.searcher.custom.CustomFieldQueryFilter">
+ <parameter name="fields" value="lang" />
+ </implementation>
+
+ </extension>
+
+</plugin>
Added: lucene/nutch/trunk/src/plugin/query-custom/src/java/org/apache/nutch/searcher/custom/CustomFieldQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-custom/src/java/org/apache/nutch/searcher/custom/CustomFieldQueryFilter.java?rev=723447&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/query-custom/src/java/org/apache/nutch/searcher/custom/CustomFieldQueryFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/query-custom/src/java/org/apache/nutch/searcher/custom/CustomFieldQueryFilter.java Thu Dec 4 13:29:21 2008
@@ -0,0 +1,153 @@
+package org.apache.nutch.searcher.custom;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.nutch.indexer.field.CustomFields;
+import org.apache.nutch.searcher.Query;
+import org.apache.nutch.searcher.QueryException;
+import org.apache.nutch.searcher.QueryFilter;
+import org.apache.nutch.searcher.Query.Clause;
+
+/**
+ * A query filter that turns query clauses on custom fields into Lucene term
+ * queries.
+ *
+ * The custom fields are read from the XML properties file named by the
+ * <code>custom.fields.config</code> property, custom-fields.xml by default,
+ * which is loaded as a resource through the class loader of CustomFields.
+ * Every property whose key ends in <code>.name</code> declares a field name,
+ * and a property with the same prefix ending in <code>.boost</code> gives the
+ * boost for that field.
+ */
+public class CustomFieldQueryFilter
+  implements QueryFilter {
+
+  public static final Log LOG = LogFactory.getLog(CustomFieldQueryFilter.class);
+  private Configuration conf;
+  private List<String> fieldNames = new ArrayList<String>();
+  private Map<String, Float> boosts = new HashMap<String, Float>();
+
+  public CustomFieldQueryFilter() {
+  }
+
+  /**
+   * Stores the configuration and loads the custom field names and boosts.
+   * A failure while loading is logged and not propagated to the caller.
+   */
+  public void setConf(Configuration conf) {
+
+    try {
+      this.conf = conf;
+      String configFile = conf.get("custom.fields.config", "custom-fields.xml");
+      LOG.info("Reading configuration field configuration from " + configFile);
+      Properties customFieldProps = new Properties();
+      InputStream fis = CustomFields.class.getClassLoader().getResourceAsStream(
+        configFile);
+      if (fis == null) {
+        throw new IOException("Was unable to open " + configFile);
+      }
+      try {
+        customFieldProps.loadFromXML(fis);
+      }
+      finally {
+        // loadFromXML is documented to close the stream after it returns,
+        // close it here as well so a failed parse does not leak it
+        fis.close();
+      }
+      Enumeration<Object> keys = customFieldProps.keys();
+      while (keys.hasMoreElements()) {
+        String prop = (String)keys.nextElement();
+        if (prop.endsWith(".name")) {
+          String propName = prop.substring(0, prop.length() - 5);
+          String name = customFieldProps.getProperty(prop);
+          fieldNames.add(name);
+          String boostKey = propName + ".boost";
+          if (customFieldProps.containsKey(boostKey)) {
+            float boost = Float.parseFloat(customFieldProps.getProperty(boostKey));
+            boosts.put(name, boost);
+          }
+        }
+      }
+    }
+    catch (Exception e) {
+      LOG.error("Error loading custom field properties:\n"
+        + StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Adds a term query to the output for every clause on a custom field.
+   *
+   * @param input The Nutch query to examine.
+   * @param output The Lucene query to add the term queries to.
+   *
+   * @return The Lucene query that was passed in.
+   */
+  public BooleanQuery filter(Query input, BooleanQuery output)
+    throws QueryException {
+
+    // examine each clause in the Nutch query
+    Clause[] clauses = input.getClauses();
+    for (int i = 0; i < clauses.length; i++) {
+      Clause c = clauses[i];
+
+      // skip non-matching clauses
+      String fieldName = c.getField();
+      if (!fieldNames.contains(fieldName)) {
+        continue;
+      }
+      String value = c.getTerm().toString().toLowerCase();
+
+      // add a Lucene TermQuery for this clause
+      TermQuery clause = new TermQuery(new Term(fieldName, value));
+      // set boost
+      if (boosts.containsKey(fieldName)) {
+        clause.setBoost(boosts.get(fieldName));
+      }
+
+      // add it as specified in query
+      output.add(clause, (c.isProhibited() ? BooleanClause.Occur.MUST_NOT
+        : (c.isRequired() ? BooleanClause.Occur.MUST
+        : BooleanClause.Occur.SHOULD)));
+    }
+
+    // return the modified Lucene query
+    return output;
+  }
+}