You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/01/31 16:24:37 UTC

svn commit: r1238663 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/parse/ParseOutputFormat.java src/java/org/apache/nutch/parse/ParseSegment.java

Author: markus
Date: Tue Jan 31 15:24:37 2012
New Revision: 1238663

URL: http://svn.apache.org/viewvc?rev=1238663&view=rev
Log:
NUTCH-1242 Allow disabling of URL Filters in ParseSegment

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1238663&r1=1238662&r2=1238663&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jan 31 15:24:37 2012
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-1242 Allow disabling of URL Filters in ParseSegment (Edward Drapkin via markus)
+
 * NUTCH-1256 WebGraph to dump host + score (markus)
 
 * NUTCH-1260 Fetcher should log fetching of redirects (Sebastian Nagel via markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1238663&r1=1238662&r2=1238663&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Jan 31 15:24:37 2012
@@ -1050,6 +1050,18 @@
   </description>
 </property>
 
+<property>
+  <name>parse.filter.urls</name>
+  <value>true</value>
+  <description>Whether the parser will filter URLs (with the configured URL filters).</description>
+</property>
+
+<property>
+  <name>parse.normalize.urls</name>
+  <value>true</value>
+  <description>Whether the parser will normalize URLs (with the configured URL normalizers).</description>
+</property>
+
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1238663&r1=1238662&r2=1238663&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Jan 31 15:24:37 2012
@@ -92,8 +92,13 @@ public class ParseOutputFormat implement
   public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job,
                                       String name, Progressable progress) throws IOException {
 
-    filters = new URLFilters(job);
-    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
+    if(job.getBoolean("parse.filter.urls", true)) {
+      filters = new URLFilters(job);
+    }
+
+    if(job.getBoolean("parse.normalize.urls", true)) {
+      normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
+    }
 
     this.scfilters = new ScoringFilters(job);
     final int interval = job.getInt("db.fetch.interval.default", 2592000);
@@ -166,13 +171,20 @@ public class ParseOutputFormat implement
                 pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
               String newUrl = pstatus.getMessage();
               int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+
               try {
-                newUrl = normalizers.normalize(newUrl,
-                    URLNormalizers.SCOPE_FETCHER);
+                if(normalizers != null) {
+                    newUrl = normalizers.normalize(newUrl,
+                        URLNormalizers.SCOPE_FETCHER);
+                }
               } catch (MalformedURLException mfue) {
                 newUrl = null;
               }
-              if (newUrl != null) newUrl = filters.filter(newUrl);
+
+              if (filters != null) {
+                if (newUrl != null) newUrl = filters.filter(newUrl);
+              }
+
               String url = key.toString();
               if (newUrl != null && !newUrl.equals(url)) {
                 String reprUrl =
@@ -295,9 +307,13 @@ public class ParseOutputFormat implement
       }
     }
     try {
-      toUrl = normalizers.normalize(toUrl,
+      if(normalizers != null) {
+        toUrl = normalizers.normalize(toUrl,
                   URLNormalizers.SCOPE_OUTLINK); // normalize the url
-      toUrl = filters.filter(toUrl);   // filter the url
+      }
+      if (filters != null) {
+        toUrl = filters.filter(toUrl);   // filter the url
+      }
       if (toUrl == null) {
         return null;
       }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1238663&r1=1238662&r2=1238663&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Jan 31 15:24:37 2012
@@ -172,12 +172,25 @@ public class ParseSegment extends Config
   public int run(String[] args) throws Exception {
     Path segment;
 
-    String usage = "Usage: ParseSegment segment";
+    String usage = "Usage: ParseSegment segment [-noFilter] [-noNormalize]";
 
     if (args.length == 0) {
       System.err.println(usage);
       System.exit(-1);
-    }      
+    }
+
+    if(args.length > 1) {
+      for(int i = 1; i < args.length; i++) {
+        String param = args[i];
+
+        if("-nofilter".equalsIgnoreCase(param)) {
+          getConf().setBoolean("parse.filter.urls", false);
+        } else if ("-nonormalize".equalsIgnoreCase(param)) {
+          getConf().setBoolean("parse.normalize.urls", false);
+        }
+      }
+    }
+
     segment = new Path(args[0]);
     parse(segment);
     return 0;