Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/24 15:12:42 UTC

svn commit: r1732160 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java

Author: markus
Date: Wed Feb 24 14:12:42 2016
New Revision: 1732160

URL: http://svn.apache.org/viewvc?rev=1732160&view=rev
Log:
NUTCH-2232 DeduplicationJob should decode URL's before length is compared

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
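
Background on the change: a percent-encoded URL can be several characters longer than an otherwise identical URL that needs no escaping, so comparing raw lengths can mark the "wrong" document as the duplicate. Below is a minimal standalone sketch (not part of the commit; the example URLs are made up) of how decoding changes the lengths being compared:

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;

    public class UrlLengthDemo {
      public static void main(String[] args) throws UnsupportedEncodingException {
        // Two URLs pointing at the same resource; the first is percent-encoded.
        String encoded = "http://example.com/caf%C3%A9";
        String plain = "http://example.com/caf\u00e9";

        // Raw lengths: the encoded form is 5 characters longer.
        System.out.println(encoded.length()); // 28
        System.out.println(plain.length());   // 23

        // After decoding ("UTF8" is accepted by the JDK as an alias of "UTF-8"),
        // both URLs compare as 23 characters, so the length comparison is no
        // longer skewed by the escaping.
        System.out.println(URLDecoder.decode(encoded, "UTF8").length()); // 23
        System.out.println(URLDecoder.decode(plain, "UTF8").length());   // 23
      }
    }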

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732160&r1=1732159&r2=1732160&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Feb 24 14:12:42 2016
@@ -10,6 +10,8 @@ in the release announcement and keep it
 
 Nutch Change Log
 
+* NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus)
+
 * NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus)
 
 * NUTCH-2227 RegexParseFilter (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1732160&r1=1732159&r2=1732160&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Wed Feb 24 14:12:42 2016
@@ -17,6 +17,8 @@
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
 import java.text.SimpleDateFormat;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -193,8 +195,15 @@ public class DeduplicationJob extends Nu
               break;
             case "urlLength":
               // same time? keep the one which has the shortest URL
-              String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
-              String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
+              String urlExisting;
+              String urlnewDoc;
+              try {
+                urlExisting = URLDecoder.decode(existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
+                urlnewDoc = URLDecoder.decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
+              } catch (UnsupportedEncodingException e) {
+                LOG.error("Error decoding: " + urlKey);
+                throw new IOException("UnsupportedEncodingException for " + urlKey);
+              }
               if (urlExisting.length() < urlnewDoc.length()) {
                 // mark new one as duplicate
                 writeOutAsDuplicate(newDoc, output, reporter);
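
For illustration, a compact standalone sketch of the tie-break introduced above; the class name, method name, and example URLs are hypothetical and not part of DeduplicationJob:

    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;

    public class TieBreakSketch {

      // Mirrors the hunk above: decode both URLs with UTF-8, then flag the new
      // document as the duplicate when the existing decoded URL is strictly shorter.
      static boolean newDocIsDuplicate(String urlExisting, String urlNewDoc) throws IOException {
        try {
          String decodedExisting = URLDecoder.decode(urlExisting, "UTF8");
          String decodedNewDoc = URLDecoder.decode(urlNewDoc, "UTF8");
          return decodedExisting.length() < decodedNewDoc.length();
        } catch (UnsupportedEncodingException e) {
          throw new IOException("UnsupportedEncodingException while decoding URLs", e);
        }
      }

      public static void main(String[] args) throws IOException {
        // The existing URL decodes to 27 characters ("new%20york" -> "new york"),
        // the new URL is 32 characters, so the new document is the duplicate.
        System.out.println(newDocIsDuplicate(
            "http://example.org/new%20york",
            "http://example.org/new-york-city"));
      }
    }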