You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/05/14 16:22:37 UTC

svn commit: r1338217 - in /nutch/branches/nutchgora: CHANGES.txt src/java/org/apache/nutch/indexer/IndexUtil.java src/java/org/apache/nutch/indexer/IndexerJob.java src/java/org/apache/nutch/indexer/IndexerReducer.java

Author: ferdy
Date: Mon May 14 14:22:37 2012
New Revision: 1338217

URL: http://svn.apache.org/viewvc?rev=1338217&view=rev
Log:
NUTCH-1366 speed up indexing by eliminating the indexreducer

Added:
    nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexUtil.java   (with props)
Removed:
    nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerReducer.java
Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1338217&r1=1338216&r2=1338217&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Mon May 14 14:22:37 2012
@@ -1,6 +1,8 @@
 Nutch Change Log
 
 Release nutchgora - Current Development
+* NUTCH-1366 speed up indexing by eliminating the indexreducer (ferdy)
+
 * NUTCH-1362 Fix error handling of urls with empty fields (lewis, ferdy)
 
 * NUTCH-1026 Strip UTF-8 non-character codepoints (markus, ferdy)

Added: nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexUtil.java?rev=1338217&view=auto
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexUtil.java (added)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexUtil.java Mon May 14 14:22:37 2012
@@ -0,0 +1,89 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+package org.apache.nutch.indexer;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TableUtil;
+
+/**
+ * Utility to create an indexed document from a webpage.  
+ *
+ */
+public class IndexUtil {
+  private static final Log LOG = LogFactory.getLog(new Object() {
+  }.getClass().getEnclosingClass());
+  
+  
+  private IndexingFilters filters;
+  private ScoringFilters scoringFilters;
+  
+  public IndexUtil(Configuration conf) {
+    filters = new IndexingFilters(conf);
+    scoringFilters = new ScoringFilters(conf);
+  }
+  
+  /**
+   * Index a webpage.
+   * 
+   * @param key The key of the page (reversed url).
+   * @param page The webpage.
+   * @return The indexed document, or null if the page was discarded by the indexing filters or an indexing/scoring filter threw an exception.
+   */
+  public NutchDocument index(String key, WebPage page) {
+    NutchDocument doc = new NutchDocument();
+    doc.add("id", key);
+    doc.add("digest", StringUtil.toHexString(page.getSignature().array()));
+
+    String url = TableUtil.unreverseUrl(key);
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Indexing URL: " + url);
+    }
+
+    try {
+      doc = filters.filter(doc, url, page);
+    } catch (IndexingException e) {
+      LOG.warn("Error indexing "+key+": "+e);
+      return null;
+    }
+
+    // skip documents discarded by indexing filters
+    if (doc == null) return null;
+
+    float boost = 1.0f;
+    // run scoring filters
+    try {
+      boost = scoringFilters.indexerScore(url, doc, page, boost);
+    } catch (final ScoringFilterException e) {
+      LOG.warn("Error calculating score " + key + ": " + e);
+      return null;
+    }
+
+    doc.setScore(boost);
+    // store boost for use by explain and dedup
+    doc.add("boost", Float.toString(boost));
+
+    return doc;
+  }
+  
+}

Propchange: nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexUtil.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java?rev=1338217&r1=1338216&r2=1338217&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/indexer/IndexerJob.java Mon May 14 14:22:37 2012
@@ -41,6 +41,7 @@ import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TableUtil;
 import org.apache.gora.mapreduce.GoraMapper;
 import org.apache.gora.mapreduce.StringComparator;
+import org.apache.gora.store.DataStore;
 
 public abstract class IndexerJob extends NutchTool implements Tool {
 
@@ -58,14 +59,27 @@ public abstract class IndexerJob extends
   }
   
   public static class IndexerMapper
-      extends GoraMapper<String, WebPage, String, WebPage> {
+      extends GoraMapper<String, WebPage, String, NutchDocument> {
+    public IndexUtil indexUtil;
+    public DataStore<String, WebPage> store;
+    
     protected Utf8 batchId;
 
     @Override
     public void setup(Context context) throws IOException {
       Configuration conf = context.getConfiguration();
       batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
+      indexUtil = new IndexUtil(conf);
+      try {
+        store = StorageUtils.createWebStore(conf, String.class, WebPage.class);
+      } catch (ClassNotFoundException e) {
+        throw new IOException(e);
+      }
     }
+    
+    protected void cleanup(Context context) throws IOException ,InterruptedException {
+      store.close();
+    };
 
     @Override
     public void map(String key, WebPage page, Context context)
@@ -85,9 +99,17 @@ public abstract class IndexerJob extends
           return;
         }
       }
-
-      context.write(key, page);
-    }    
+      
+      NutchDocument doc = indexUtil.index(key, page);
+      if (doc == null) {
+        return;
+      }
+      if (mark != null) {
+        Mark.INDEX_MARK.putMark(page, Mark.UPDATEDB_MARK.checkMark(page));
+        store.put(key, page);
+      }
+      context.write(key, doc);
+    }
   }
 
 
@@ -110,9 +132,9 @@ public abstract class IndexerJob extends
         StringComparator.class, RawComparator.class);
 
     Collection<WebPage.Field> fields = getFields(job);
-    StorageUtils.initMapperJob(job, fields, String.class, WebPage.class,
+    StorageUtils.initMapperJob(job, fields, String.class, NutchDocument.class,
         IndexerMapper.class);
-    job.setReducerClass(IndexerReducer.class);
+    job.setNumReduceTasks(0);
     job.setOutputFormatClass(IndexerOutputFormat.class);
     return job;
   }