You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2007/01/06 20:49:52 UTC

svn commit: r493548 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexingFilters.java src/test/org/apache/nutch/indexer/TestIndexingFilters.java

Author: siren
Date: Sat Jan  6 11:49:49 2007
New Revision: 493548

URL: http://svn.apache.org/viewvc?view=rev&rev=493548
Log:
fix NUTCH-421

Added:
    lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=493548&r1=493547&r2=493548
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Jan  6 11:49:49 2007
@@ -119,6 +119,9 @@
 
 38. NUTCH-325 - UrlFilters.java throws NPE in case urlfilter.order contains
     Filters that are not in plugin.includes (Stefan Groschupf, siren)
+    
+39. NUTCH-421 - Allow predetermined running order of indexing filters
+    (Alan Tanaman, siren)
 
 
 Release 0.8 - 2006-07-25

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=493548&r1=493547&r2=493548
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Jan  6 11:49:49 2007
@@ -536,6 +536,24 @@
   </description>
 </property>
 
+<!-- indexingfilter plugin properties -->
+
+<property>
+  <name>indexingfilter.order</name>
+  <value></value>
+  <description>The order in which indexing filters are applied.
+  If empty, all available indexing filters (as dictated by properties
+  plugin.includes and plugin.excludes above) are loaded and applied in a
+  system-defined order. If not empty, only the named filters are loaded and
+  applied in the given order. For example, if this property has the value:
+  org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter
+  then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
+  
+  Filter ordering may affect the result if one filter depends on the output of
+  another filter.
+  </description>
+</property>
+
 
 <!-- analysis properties -->
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?view=diff&rev=493548&r1=493547&r2=493548
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Sat Jan  6 11:49:49 2007
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.indexer;
 
+import java.util.ArrayList;
 import java.util.HashMap;
 
 // Commons Logging imports
@@ -35,13 +36,22 @@
 /** Creates and caches {@link IndexingFilter} implementing plugins.*/
 public class IndexingFilters {
 
+  public static final String INDEXINGFILTER_ORDER = "indexingfilter.order";
+
   public final static Log LOG = LogFactory.getLog(IndexingFilters.class);
 
   private IndexingFilter[] indexingFilters;
 
   public IndexingFilters(Configuration conf) {
+      /* Get indexingfilter.order property */
+      String order = conf.get(INDEXINGFILTER_ORDER);
       this.indexingFilters =(IndexingFilter[]) conf.getObject(IndexingFilter.class.getName()); 
       if (this.indexingFilters == null) {
+          /* If ordered filters are required, prepare array of filters based on property */
+          String[] orderedFilters = null;
+          if (order != null && !order.trim().equals("")) {
+              orderedFilters = order.split("\\s+");
+          }
             try {
                 ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(IndexingFilter.X_POINT_ID);
                 if (point == null)
@@ -58,7 +68,21 @@
                         filterMap.put(filter.getClass().getName(), filter);
                     }
                 }
-                conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+                /* If no ordered filters required, just get the filters in an indeterminate order */
+                if (orderedFilters == null) {
+                    conf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+                /* Otherwise run the filters in the required order */
+                } else {
+                    ArrayList<IndexingFilter> filters = new ArrayList<IndexingFilter>();
+                    for (int i = 0; i < orderedFilters.length; i++) {
+                        IndexingFilter filter = (IndexingFilter) filterMap
+                                .get(orderedFilters[i]);
+                        if (filter != null) {
+                          filters.add(filter);
+                        }
+                    }
+                    conf.setObject(IndexingFilter.class.getName(), filters.toArray(new IndexingFilter[filters.size()]));
+                }
             } catch (PluginRuntimeException e) {
                 throw new RuntimeException(e);
             }

Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java?view=auto&rev=493548
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java Sat Jan  6 11:49:49 2007
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestIndexingFilters extends TestCase {
+
+  /**
+   * Test behaviour when defined filter does not exist.
+   * @throws IndexingException
+   */
+  public void testNonExistingIndexingFilter() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    String class1 = "NonExistingFilter";
+    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
+
+    IndexingFilters filters = new IndexingFilters(conf);
+    filters.filter(new Document(), new ParseImpl("text", new ParseData(
+        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
+        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+  }
+
+}