You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/01/31 16:24:37 UTC
svn commit: r1238663 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/parse/ParseOutputFormat.java
src/java/org/apache/nutch/parse/ParseSegment.java
Author: markus
Date: Tue Jan 31 15:24:37 2012
New Revision: 1238663
URL: http://svn.apache.org/viewvc?rev=1238663&view=rev
Log:
NUTCH-1242 Allow disabling of URL Filters in ParseSegment
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1238663&r1=1238662&r2=1238663&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jan 31 15:24:37 2012
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1242 Allow disabling of URL Filters in ParseSegment (Edward Drapkin via markus)
+
* NUTCH-1256 WebGraph to dump host + score (markus)
* NUTCH-1260 Fetcher should log fetching of redirects (Sebastian Nagel via markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1238663&r1=1238662&r2=1238663&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Jan 31 15:24:37 2012
@@ -1050,6 +1050,18 @@
</description>
</property>
+<property>
+ <name>parse.filter.urls</name>
+ <value>true</value>
+ <description>Whether the parser will filter URLs (with the configured URL filters).</description>
+</property>
+
+<property>
+ <name>parse.normalize.urls</name>
+ <value>true</value>
+ <description>Whether the parser will normalize URLs (with the configured URL normalizers).</description>
+</property>
+
<!-- urlfilter plugin properties -->
<property>
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1238663&r1=1238662&r2=1238663&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Jan 31 15:24:37 2012
@@ -92,8 +92,13 @@ public class ParseOutputFormat implement
public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job,
String name, Progressable progress) throws IOException {
- filters = new URLFilters(job);
- normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
+ if(job.getBoolean("parse.filter.urls", true)) {
+ filters = new URLFilters(job);
+ }
+
+ if(job.getBoolean("parse.normalize.urls", true)) {
+ normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
+ }
this.scfilters = new ScoringFilters(job);
final int interval = job.getInt("db.fetch.interval.default", 2592000);
@@ -166,13 +171,20 @@ public class ParseOutputFormat implement
pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
+
try {
- newUrl = normalizers.normalize(newUrl,
- URLNormalizers.SCOPE_FETCHER);
+ if(normalizers != null) {
+ newUrl = normalizers.normalize(newUrl,
+ URLNormalizers.SCOPE_FETCHER);
+ }
} catch (MalformedURLException mfue) {
newUrl = null;
}
- if (newUrl != null) newUrl = filters.filter(newUrl);
+
+ if (filters != null) {
+ if (newUrl != null) newUrl = filters.filter(newUrl);
+ }
+
String url = key.toString();
if (newUrl != null && !newUrl.equals(url)) {
String reprUrl =
@@ -295,9 +307,13 @@ public class ParseOutputFormat implement
}
}
try {
- toUrl = normalizers.normalize(toUrl,
+ if(normalizers != null) {
+ toUrl = normalizers.normalize(toUrl,
URLNormalizers.SCOPE_OUTLINK); // normalize the url
- toUrl = filters.filter(toUrl); // filter the url
+ }
+ if (filters != null) {
+ toUrl = filters.filter(toUrl); // filter the url
+ }
if (toUrl == null) {
return null;
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1238663&r1=1238662&r2=1238663&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue Jan 31 15:24:37 2012
@@ -172,12 +172,25 @@ public class ParseSegment extends Config
public int run(String[] args) throws Exception {
Path segment;
- String usage = "Usage: ParseSegment segment";
+ String usage = "Usage: ParseSegment segment [-noFilter] [-noNormalize]";
if (args.length == 0) {
System.err.println(usage);
System.exit(-1);
- }
+ }
+
+ if(args.length > 1) {
+ for(int i = 1; i < args.length; i++) {
+ String param = args[i];
+
+ if("-nofilter".equalsIgnoreCase(param)) {
+ getConf().setBoolean("parse.filter.urls", false);
+ } else if ("-nonormalize".equalsIgnoreCase(param)) {
+ getConf().setBoolean("parse.normalize.urls", false);
+ }
+ }
+ }
+
segment = new Path(args[0]);
parse(segment);
return 0;