You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/01/13 14:10:19 UTC
svn commit: r1724418 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: markus
Date: Wed Jan 13 13:10:19 2016
New Revision: 1724418
URL: http://svn.apache.org/viewvc?rev=1724418&view=rev
Log:
NUTCH-2196 IndexingFiltersChecker to optionally normalize URLs
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724418&r1=1724417&r2=1724418&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan 13 13:10:19 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2196 IndexingFilterChecker to optionally normalize (markus)
+
* NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus)
* NUTCH-2190 Protocol normalizer (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724418&r1=1724417&r2=1724418&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 13:10:19 2016
@@ -32,6 +32,7 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseSegment;
@@ -43,7 +44,6 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -69,10 +69,11 @@ public class IndexingFiltersChecker exte
public int run(String[] args) throws Exception {
String contentType = null;
String url = null;
+ URLNormalizers normalizers = null;
boolean dumpText = false;
boolean followRedirects = false;
- String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] <url>";
+ String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] <url>";
if (args.length == 0) {
System.err.println(usage);
@@ -83,7 +84,9 @@ public class IndexingFiltersChecker exte
HashMap<String, String> metadata = new HashMap<String, String>();
for (int i = 0; i < args.length; i++) {
- if (args[i].equals("-followRedirects")) {
+ if (args[i].equals("-normalize")) {
+ normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+ } else if (args[i].equals("-followRedirects")) {
followRedirects = true;
} else if (args[i].equals("-dumpText")) {
dumpText = true;
@@ -101,9 +104,13 @@ public class IndexingFiltersChecker exte
System.err.println(usage);
System.exit(-1);
} else {
- url = URLUtil.toASCII(args[i]);
+ url =args[i];
}
}
+
+ if (normalizers != null) {
+ url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+ }
LOG.info("fetching: " + url);
@@ -129,6 +136,11 @@ public class IndexingFiltersChecker exte
while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) {
String[] stuff = output.getStatus().getArgs();
url = stuff[0];
+
+ if (normalizers != null) {
+ url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+ }
+
turl.set(url);
// try again