You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/07 20:48:58 UTC
svn commit: r1347755 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
src/java/org/apache/nutch/parse/ParserChecker.java
src/java/org/apache/nutch/util/URLUtil.java
Author: markus
Date: Thu Jun 7 18:48:58 2012
New Revision: 1347755
URL: http://svn.apache.org/viewvc?rev=1347755&view=rev
Log:
NUTCH-1320 IndexChecker and ParseChecker choke on IDN's
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1347755&r1=1347754&r2=1347755&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jun 7 18:48:58 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1320 IndexChecker and ParseChecker choke on IDN's (markus)
+
* NUTCH-1351 DomainStatistics to aggregate by TLD (markus)
* NUTCH-1381 Allow to override default subcollection field name (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1347755&r1=1347754&r2=1347755&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu Jun 7 18:48:58 2012
@@ -40,47 +40,47 @@ import org.apache.nutch.protocol.Content
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
/**
* Reads and parses a URL and run the indexers on it. Displays the fields obtained and the first
* 100 characters of their value
- *
+ *
* Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker http://www.lemonde.fr
* @author Julien Nioche
**/
public class IndexingFiltersChecker extends Configured implements Tool {
-
+
public static final Logger LOG = LoggerFactory.getLogger(IndexingFiltersChecker.class);
-
+
public IndexingFiltersChecker() {
}
-
+
public int run(String[] args) throws Exception {
-
String contentType = null;
String url = null;
-
+
String usage = "Usage: IndexingFiltersChecker <url>";
-
+
if (args.length != 1) {
System.err.println(usage);
System.exit(-1);
}
-
- url = args[0];
-
+
+ url = URLUtil.toASCII(args[0]);
+
if (LOG.isInfoEnabled()) {
LOG.info("fetching: " + url);
}
-
+
IndexingFilters indexers = new IndexingFilters(conf);
-
+
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
CrawlDatum datum = new CrawlDatum();
-
+
Content content = protocol.getProtocolOutput(new Text(url), datum)
.getContent();
@@ -91,20 +91,20 @@ public class IndexingFiltersChecker exte
System.out.println("No content for " + url);
return 0;
}
-
+
contentType = content.getContentType();
-
+
if (contentType == null) {
return -1;
}
-
+
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
}
ParseResult parseResult = new ParseUtil(conf).parse(content);
-
+
NutchDocument doc = new NutchDocument();
Text urlText = new Text(url);
@@ -128,19 +128,19 @@ public class IndexingFiltersChecker exte
}
return 0;
}
-
+
public static void main(String[] args) throws Exception {
final int res = ToolRunner.run(NutchConfiguration.create(),
new IndexingFiltersChecker(), args);
System.exit(res);
}
-
+
Configuration conf;
-
+
public Configuration getConf() {
return conf;
}
-
+
@Override
public void setConf(Configuration arg0) {
conf = arg0;
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1347755&r1=1347754&r2=1347755&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Thu Jun 7 18:48:58 2012
@@ -29,6 +29,7 @@ import org.apache.nutch.protocol.Content
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.StringUtil;
/**
@@ -69,7 +70,7 @@ public class ParserChecker implements To
System.err.println(usage);
System.exit(-1);
} else {
- url = args[i];
+ url = URLUtil.toASCII(args[i]);
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1347755&r1=1347754&r2=1347755&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Thu Jun 7 18:48:58 2012
@@ -465,6 +465,43 @@ public class URLUtil {
}
}
+ public static String toASCII(String url) {
+ try {
+ URL u = new URL(url);
+ URI p = new URI(u.getProtocol(),
+ null,
+ IDN.toASCII(u.getHost()),
+ u.getPort(),
+ u.getPath(),
+ u.getQuery(),
+ u.getRef());
+
+ return p.toString();
+ }
+ catch (Exception e) {
+ return null;
+ }
+ }
+
+ public static String toUNICODE(String url) {
+ try {
+ URL u = new URL(url);
+ URI p = new URI(u.getProtocol(),
+ null,
+ IDN.toUnicode(u.getHost()),
+ u.getPort(),
+ u.getPath(),
+ u.getQuery(),
+ u.getRef());
+
+ return p.toString();
+ }
+ catch (Exception e) {
+ return null;
+ }
+ }
+
+
/** For testing */
public static void main(String[] args){