You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/24 15:12:42 UTC
svn commit: r1732160 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus
Date: Wed Feb 24 14:12:42 2016
New Revision: 1732160
URL: http://svn.apache.org/viewvc?rev=1732160&view=rev
Log:
NUTCH-2232 DeduplicationJob should decode URLs before length is compared
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732160&r1=1732159&r2=1732160&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Feb 24 14:12:42 2016
@@ -10,6 +10,8 @@ in the release announcement and keep it
Nutch Change Log
+* NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus)
+
* NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus)
* NUTCH-2227 RegexParseFilter (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1732160&r1=1732159&r2=1732160&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Wed Feb 24 14:12:42 2016
@@ -17,6 +17,8 @@
package org.apache.nutch.crawl;
import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Iterator;
@@ -193,8 +195,15 @@ public class DeduplicationJob extends Nu
break;
case "urlLength":
// same time? keep the one which has the shortest URL
- String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
- String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
+ String urlExisting;
+ String urlnewDoc;
+ try {
+ urlExisting = URLDecoder.decode(existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
+ urlnewDoc = URLDecoder.decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
+ } catch (UnsupportedEncodingException e) {
+ LOG.error("Error decoding: " + urlKey);
+ throw new IOException("UnsupportedEncodingException for " + urlKey);
+ }
if (urlExisting.length() < urlnewDoc.length()) {
// mark new one as duplicate
writeOutAsDuplicate(newDoc, output, reporter);