You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2007/01/06 20:49:52 UTC
svn commit: r493548 - in /lucene/nutch/trunk: CHANGES.txt
conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexingFilters.java
src/test/org/apache/nutch/indexer/TestIndexingFilters.java
Author: siren
Date: Sat Jan 6 11:49:49 2007
New Revision: 493548
URL: http://svn.apache.org/viewvc?view=rev&rev=493548
Log:
fix NUTCH-421
Added:
lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=493548&r1=493547&r2=493548
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Jan 6 11:49:49 2007
@@ -119,6 +119,9 @@
38. NUTCH-325 - UrlFilters.java throws NPE in case urlfilter.order contains
Filters that are not in plugin.includes (Stefan Groschupf, siren)
+
+39. NUTCH-421 - Allow predetermined running order of indexing filters
+ (Alan Tanaman, siren)
Release 0.8 - 2006-07-25
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=493548&r1=493547&r2=493548
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Jan 6 11:49:49 2007
@@ -536,6 +536,24 @@
</description>
</property>
+<!-- indexingfilter plugin properties -->
+
+<property>
+ <name>indexingfilter.order</name>
+ <value></value>
+ <description>The order in which indexing filters are applied.
+ If empty, all available indexing filters (as dictated by the properties
+ plugin.includes and plugin.excludes above) are loaded and applied in a
+ system-defined order. If not empty, only the named filters are loaded and
+ applied in the given order. For example, if this property has the value:
+ org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter
+ then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
+
+ Filter ordering may affect the result if one filter depends on the output of
+ another filter.
+ </description>
+</property>
+
<!-- analysis properties -->
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=493548&r1=493547&r2=493548
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Sat Jan 6 11:49:49 2007
@@ -17,6 +17,7 @@
package org.apache.nutch.indexer;
+import java.util.ArrayList;
import java.util.HashMap;
// Commons Logging imports
@@ -35,13 +36,22 @@
/** Creates and caches {@link IndexingFilter} implementing plugins.*/
public class IndexingFilters {
+ public static final String INDEXINGFILTER_ORDER = "indexingfilter.order";
+
public final static Log LOG = LogFactory.getLog(IndexingFilters.class);
private IndexingFilter[] indexingFilters;
public IndexingFilters(Configuration conf) {
+ /* Get indexingfilter.order property */
+ String order = conf.get(INDEXINGFILTER_ORDER);
this.indexingFilters =(IndexingFilter[]) conf.getObject(IndexingFilter.class.getName());
if (this.indexingFilters == null) {
+ /* If an order is configured, split the property into an array of filter class names */
+ String[] orderedFilters = null;
+ if (order != null && !order.trim().equals("")) {
+ orderedFilters = order.split("\\s+");
+ }
try {
ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(IndexingFilter.X_POINT_ID);
if (point == null)
@@ -58,7 +68,21 @@
filterMap.put(filter.getClass().getName(), filter);
}
}
- conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+ /* If no order is configured, use all loaded filters in an indeterminate order */
+ if (orderedFilters == null) {
+ conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+ /* Otherwise keep only the named filters, in the configured order; unknown names are skipped */
+ } else {
+ ArrayList<IndexingFilter> filters = new ArrayList<IndexingFilter>();
+ for (int i = 0; i < orderedFilters.length; i++) {
+ IndexingFilter filter = (IndexingFilter) filterMap
+ .get(orderedFilters[i]);
+ if (filter != null) {
+ filters.add(filter);
+ }
+ }
+ conf.setObject(IndexingFilter.class.getName(), filters.toArray(new IndexingFilter[filters.size()]));
+ }
} catch (PluginRuntimeException e) {
throw new RuntimeException(e);
}
Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java?view=auto&rev=493548
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java Sat Jan 6 11:49:49 2007
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestIndexingFilters extends TestCase {
+
+ /**
+ * Tests that filtering still works when indexingfilter.order names a filter that does not exist.
+ * @throws IndexingException
+ */
+ public void testNonExistingIndexingFilter() throws IndexingException {
+ Configuration conf = NutchConfiguration.create();
+ String class1 = "NonExistingFilter";
+ String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
+ conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
+
+ IndexingFilters filters = new IndexingFilters(conf);
+ filters.filter(new Document(), new ParseImpl("text", new ParseData(
+ new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
+ "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+ }
+
+}