You are viewing a plain text version of this content; the hyperlink to the canonical version was not preserved in this rendering.
Posted to commits@nutch.apache.org by ma...@apache.org on 2014/05/26 12:47:12 UTC
svn commit: r1597556 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/crawl/CrawlDb.java
Author: markus
Date: Mon May 26 10:47:11 2014
New Revision: 1597556
URL: http://svn.apache.org/r1597556
Log:
NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1597556&r1=1597555&r2=1597556&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 26 10:47:11 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters (Diaa via markus)
+
* NUTCH-1757 ParserChecker to take custom metadata as input (jnioche)
* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1597556&r1=1597555&r2=1597556&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon May 26 10:47:11 2014
@@ -76,11 +76,11 @@
NOTE: You should also check other related properties:
- http.robots.agents
- http.agent.description
- http.agent.url
- http.agent.email
- http.agent.version
+ http.robots.agents
+ http.agent.description
+ http.agent.url
+ http.agent.email
+ http.agent.version
and set their values appropriately.
@@ -345,7 +345,6 @@
</property>
<!-- web db properties -->
-
<property>
<name>db.fetch.interval.default</name>
<value>2592000</value>
@@ -450,6 +449,18 @@
</property>
<property>
+ <name>db.url.normalizers</name>
+ <value>false</value>
+ <description>Normalize urls when updating crawldb</description>
+</property>
+
+<property>
+ <name>db.url.filters</name>
+ <value>false</value>
+ <description>Filter urls when updating crawldb</description>
+</property>
+
+<property>
<name>db.update.max.inlinks</name>
<value>10000</value>
<description>Maximum number of inlinks to take into account when updating
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1597556&r1=1597555&r2=1597556&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Mon May 26 10:47:11 2014
@@ -114,7 +114,9 @@ public class CrawlDb extends Configured
long end = System.currentTimeMillis();
LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
-
+/*
+ * Configure a new CrawlDb in a temp folder at crawlDb/<rand>
+ */
public static JobConf createJob(Configuration config, Path crawlDb)
throws IOException {
Path newCrawlDb =
@@ -180,12 +182,11 @@ public class CrawlDb extends Configured
return -1;
}
- boolean normalize = false;
- boolean filter = false;
+ boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false);
+ boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
+ boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
boolean force = false;
- boolean url404Purging = false;
final FileSystem fs = FileSystem.get(getConf());
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
HashSet<Path> dirs = new HashSet<Path>();
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-normalize")) {