You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/12 13:26:28 UTC

svn commit: r1349262 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java src/java/org/apache/nutch/indexer/solr/SolrIndexer.java src/java/org/apache/nutch/net/URLNormalizers.java

Author: markus
Date: Tue Jun 12 11:26:28 2012
New Revision: 1349262

URL: http://svn.apache.org/viewvc?rev=1349262&view=rev
Log:
NUTCH-1300 Indexer to filter and normalize URLs

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
    nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
    nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1349262&r1=1349261&r2=1349262&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun 12 11:26:28 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1300 Indexer to filter and normalize URLs (markus)
+
 * NUTCH-1330 WebGraph OutlinkDB to preserve back up (markus)
 
 * NUTCH-1319 HostNormalizer plugin (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1349262&r1=1349261&r2=1349262&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Jun 12 11:26:28 2012
@@ -41,6 +41,8 @@ import org.apache.nutch.crawl.LinkDb;
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
@@ -56,11 +58,21 @@ implements Mapper<Text, Writable, Text, 
 
   public static final String INDEXER_DELETE = "indexer.delete";
   public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
+  public static final String URL_FILTERING = "indexer.url.filters";
+  public static final String URL_NORMALIZING = "indexer.url.normalizers";
 
   private boolean skip = false;
   private boolean delete = false;
   private IndexingFilters filters;
   private ScoringFilters scfilters;
+  
+  // whether url normalizing and/or filtering is enabled for this job
+  private boolean normalize = false;
+  private boolean filter = false;
+
+  // url normalizers and filters, created in configure() only when enabled
+  private URLNormalizers urlNormalizers;
+  private URLFilters urlFilters;
 
   public void configure(JobConf job) {
     setConf(job);
@@ -68,10 +80,80 @@ implements Mapper<Text, Writable, Text, 
     this.scfilters = new ScoringFilters(getConf());
     this.delete = job.getBoolean(INDEXER_DELETE, false);
     this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
+
+    normalize = job.getBoolean(URL_NORMALIZING, false);
+    filter = job.getBoolean(URL_FILTERING, false);
+
+    if (normalize) {
+      urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+    }
+
+    if (filter) {
+      urlFilters = new URLFilters(getConf());
+    }
+  }
+
+  /**
+   * Normalizes the given url in the indexer scope and trims surrounding whitespace.
+   *
+   * @param url The url to normalize.
+   *
+   * @return The normalized url, the url unchanged when normalizing is disabled, or null if it fails or yields no url.
+   */
+  private String normalizeUrl(String url) {
+    if (!normalize) {
+      return url;
+    }
+
+    String normalized = null;
+    if (urlNormalizers != null) {
+      try {
+        normalized = urlNormalizers.normalize(url,
+          URLNormalizers.SCOPE_INDEXER);
+        // guard against a null result rather than relying on the catch below to absorb an NPE
+        if (normalized != null) {
+          normalized = normalized.trim();
+        }
+      } catch (Exception e) {
+        LOG.warn("Skipping " + url + ":" + e);
+        normalized = null;
+      }
+    }
+
+    return normalized;
+  }
+
+  /**
+   * Filters the given url through the configured url filters.
+   *
+   * @param url The url to filter, may be null.
+   *
+   * @return The url unchanged when filtering is disabled or url is null, else the filter result; null if filtering fails.
+   */
+  private String filterUrl(String url) {
+    if (!filter || url == null) {
+      return url;
+    }
+
+    try {
+      return urlFilters.filter(url);
+    } catch (Exception e) {
+      // log instead of silently dropping the url, mirroring normalizeUrl
+      LOG.warn("Skipping " + url + ":" + e);
+      return null;
+    }
   }
 
   public void map(Text key, Writable value,
       OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
+    // normalize first, then filter; a null result means the record is not emitted
+    final String normalized = normalizeUrl(key.toString());
+    final String accepted = filterUrl(normalized);
+    if (accepted == null) {
+      return;
+    }
+    key.set(accepted);
+
     output.collect(key, new NutchWritable(value));
   }
 

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=1349262&r1=1349261&r2=1349262&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Tue Jun 12 11:26:28 2012
@@ -72,6 +72,13 @@ public class SolrIndexer extends Configu
   
   public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,
       List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams) throws IOException {
+    indexSolr(solrUrl, crawlDb, linkDb, segments, noCommit, deleteGone, solrParams, false, false); // filter=false, normalize=false
+  }
+  
+  public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,
+      List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams,
+      boolean filter, boolean normalize) throws IOException {
+      
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     LOG.info("SolrIndexer: starting at " + sdf.format(start));
@@ -79,14 +86,16 @@ public class SolrIndexer extends Configu
     final JobConf job = new NutchJob(getConf());
     job.setJobName("index-solr " + solrUrl);
 
-    if (deleteGone) {
-      LOG.info("SolrIndexer: deleting gone documents");
-    }
-
+    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
+    LOG.info("SolrIndexer: URL filtering: " + filter);
+    LOG.info("SolrIndexer: URL normalizing: " + normalize);
+    
     IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
 
     job.set(SolrConstants.SERVER_URL, solrUrl);
     job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
+    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
+    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
     if (solrParams != null) {
       job.set(SolrConstants.PARAMS, solrParams);
     }
@@ -118,7 +127,7 @@ public class SolrIndexer extends Configu
 
   public int run(String[] args) throws Exception {
     if (args.length < 3) {
-      System.err.println("Usage: SolrIndexer <solr url> <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone]");
+      System.err.println("Usage: SolrIndexer <solr url> <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
       return -1;
     }
 
@@ -130,6 +139,8 @@ public class SolrIndexer extends Configu
 
     boolean noCommit = false;
     boolean deleteGone = false;
+    boolean filter = false;
+    boolean normalize = false;
 
     for (int i = 2; i < args.length; i++) {
     	if (args[i].equals("-linkdb")) {
@@ -148,6 +159,10 @@ public class SolrIndexer extends Configu
         noCommit = true;
       } else if (args[i].equals("-deleteGone")) {
         deleteGone = true;
+      } else if (args[i].equals("-filter")) {
+        filter = true;
+      } else if (args[i].equals("-normalize")) {
+        normalize = true;
       } else if (args[i].equals("-params")) {
         params = args[++i];
       } else {
@@ -156,7 +171,7 @@ public class SolrIndexer extends Configu
     }
 
     try {
-      indexSolr(args[0], crawlDb, linkDb, segments, noCommit, deleteGone, params);
+      indexSolr(args[0], crawlDb, linkDb, segments, noCommit, deleteGone, params, filter, normalize);
       return 0;
     } catch (final Exception e) {
       LOG.error("SolrIndexer: " + StringUtils.stringifyException(e));

Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java?rev=1349262&r1=1349261&r2=1349262&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java Tue Jun 12 11:26:28 2012
@@ -95,7 +95,8 @@ public final class URLNormalizers {
   public static final String SCOPE_INJECT = "inject";
   /** Scope used when constructing new {@link org.apache.nutch.parse.Outlink} instances. */
   public static final String SCOPE_OUTLINK = "outlink";
-  
+  /** Scope used when indexing URLs. */
+  public static final String SCOPE_INDEXER = "indexer";
 
   public static final Logger LOG = LoggerFactory.getLogger(URLNormalizers.class);