Posted to commits@nutch.apache.org by sn...@apache.org on 2022/01/17 18:57:03 UTC
[nutch] branch master updated: NUTCH-2935 DeduplicationJob: failure on URLs with invalid percent encoding - catch IllegalArgumentException when unescaping percent-encoding in URLs - if one URL of two compared URLs is valid, keep it as non-duplicate - add unit tests for DeduplicationJob
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new d565f45 NUTCH-2935 DeduplicationJob: failure on URLs with invalid percent encoding - catch IllegalArgumentException when unescaping percent-encoding in URLs - if one URL of two compared URLs is valid, keep it as non-duplicate - add unit tests for DeduplicationJob
d565f45 is described below
commit d565f45a67d2491b7b536ae95560522aa20b8c26
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Fri Jan 14 10:34:22 2022 +0100
NUTCH-2935 DeduplicationJob: failure on URLs with invalid percent encoding
- catch IllegalArgumentException when unescaping percent-encoding in URLs
- if one URL of two compared URLs is valid, keep it as non-duplicate
- add unit tests for DeduplicationJob
---
.../org/apache/nutch/crawl/DeduplicationJob.java | 60 ++++----
.../nutch/crawl/TestCrawlDbDeduplication.java | 163 +++++++++++++++++++++
.../current/part-r-00000/.data.crc | Bin 0 -> 32 bytes
.../current/part-r-00000/.index.crc | Bin 0 -> 12 bytes
.../current/part-r-00000/data | Bin 0 -> 2604 bytes
.../current/part-r-00000/index | Bin 0 -> 233 bytes
6 files changed, 196 insertions(+), 27 deletions(-)
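The heart of the change is how percent-encoded URLs are handled in the urlLength comparison: instead of aborting the reduce task when URLDecoder throws, each URL is now decoded independently and falls back to its raw (encoded) form on failure. A minimal standalone sketch of that pattern outside the MapReduce context follows; the helper name decodeOrKeep is illustrative only and not part of the commit.

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;
    import java.nio.charset.StandardCharsets;

    public class PercentDecodeSketch {

      /** Decode percent-encoding; on invalid sequences keep the URL as-is. */
      static String decodeOrKeep(String url) {
        try {
          return URLDecoder.decode(url, StandardCharsets.UTF_8.toString());
        } catch (UnsupportedEncodingException | IllegalArgumentException e) {
          // e.g. "%YR" is not a valid percent-encoded sequence: keep the
          // encoded form so the URL can still be compared by length
          return url;
        }
      }

      public static void main(String[] args) {
        System.out.println(decodeOrKeep("https://example.com/b%C3%BCcher")); // decoded to ".../bücher"
        System.out.println(decodeOrKeep("https://example.com/%YR"));         // kept unchanged
      }
    }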
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 7751366..5f1172d 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
@@ -30,6 +31,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
@@ -59,15 +61,16 @@ import org.slf4j.LoggerFactory;
* with the latest timestamp is kept. If the documents have the same timestamp
* then the one with the shortest URL is kept. The documents marked as duplicate
* can then be deleted with the command CleaningJob.
- ***/
+ */
public class DeduplicationJob extends NutchTool implements Tool {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private final static Text urlKey = new Text("_URLTEMPKEY_");
- private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
- private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
+ protected final static Text urlKey = new Text("_URLTEMPKEY_");
+ protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
+ protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
+ protected final static String UTF_8 = StandardCharsets.UTF_8.toString();
public static class DBFilter extends
Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
@@ -76,13 +79,12 @@ public class DeduplicationJob extends NutchTool implements Tool {
@Override
public void setup(Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum>.Context context) {
- Configuration arg0 = context.getConfiguration();
- groupMode = arg0.get(DEDUPLICATION_GROUP_MODE);
+ Configuration conf = context.getConfiguration();
+ groupMode = conf.get(DEDUPLICATION_GROUP_MODE);
}
@Override
- public void map(Text key, CrawlDatum value,
- Context context)
+ public void map(Text key, CrawlDatum value, Context context)
throws IOException, InterruptedException {
if (value.getStatus() == CrawlDatum.STATUS_DB_FETCHED
@@ -121,18 +123,19 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
}
- public static class DedupReducer extends
- Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {
+ public static class DedupReducer<K extends Writable>
+ extends Reducer<K, CrawlDatum, Text, CrawlDatum> {
- private String[] compareOrder;
+ protected String[] compareOrder;
@Override
- public void setup(Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum>.Context context) {
+ public void setup(
+ Reducer<K, CrawlDatum, Text, CrawlDatum>.Context context) {
Configuration conf = context.getConfiguration();
compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(",");
}
- private void writeOutAsDuplicate(CrawlDatum datum,
+ protected void writeOutAsDuplicate(CrawlDatum datum,
Context context)
throws IOException, InterruptedException {
datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
@@ -143,8 +146,8 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
@Override
- public void reduce(BytesWritable key, Iterable<CrawlDatum> values,
- Context context) throws IOException, InterruptedException {
+ public void reduce(K key, Iterable<CrawlDatum> values, Context context)
+ throws IOException, InterruptedException {
CrawlDatum existingDoc = null;
for (CrawlDatum newDoc : values) {
@@ -164,8 +167,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
}
- private CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc)
- throws IOException {
+ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
for (int i = 0; i < compareOrder.length; i++) {
switch (compareOrder[i]) {
case "score":
@@ -203,17 +205,21 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
break;
case "urlLength":
- // same time? keep the one which has the shortest URL
- String urlExisting;
- String urlnewDoc;
+ // keep the one which has the shortest URL
+ // normalized by decoding percent-encoded sequences
+ String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
+ String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
+ try {
+ urlExisting = URLDecoder.decode(urlExisting, UTF_8);
+ } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ LOG.error("Error decoding: {}", urlExisting, e);
+ // use the encoded URL
+ }
try {
- urlExisting = URLDecoder.decode(
- existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
- urlnewDoc = URLDecoder
- .decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
- } catch (UnsupportedEncodingException e) {
- LOG.error("Error decoding: " + urlKey);
- throw new IOException("UnsupportedEncodingException for " + urlKey);
+ urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
+ } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ LOG.error("Error decoding: {}", urlnewDoc, e);
+ // use the encoded URL
}
if (urlExisting.length() < urlnewDoc.length()) {
// mark new one as duplicate
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java b/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java
new file mode 100644
index 0000000..25850a9
--- /dev/null
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestCrawlDbDeduplication {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ Configuration conf;
+ FileSystem fs;
+ String testDir;
+ Path testCrawlDb;
+ CrawlDbReader reader;
+
+ @Before
+ public void setUp() throws Exception {
+ conf = NutchConfiguration.create();
+ fs = FileSystem.get(conf);
+ testDir = "test-crawldb-" + new java.util.Random().nextInt();
+ File sampleCrawlDb = new File(System.getProperty("test.build.data", "."),
+ "deduplication-crawldb");
+ LOG.info("Copying CrawlDb {} into test directory {}", sampleCrawlDb,
+ testDir);
+ FileUtils.copyDirectory(sampleCrawlDb, new File(testDir));
+ testCrawlDb = new Path(testDir);
+ for (FileStatus s : fs.listStatus(testCrawlDb)) {
+ LOG.info("{}", s);
+ }
+ reader = new CrawlDbReader();
+ }
+
+ @After
+ public void tearDown() {
+ try {
+ if (fs.exists(testCrawlDb))
+ fs.delete(testCrawlDb, true);
+ } catch (Exception e) {
+ }
+ try {
+ reader.close();
+ } catch (Exception e) {
+ }
+ }
+
+ @Test
+ public void testDeduplication() throws Exception {
+ String[] args = new String[3];
+ args[0] = testCrawlDb.toString();
+ args[1] = "-compareOrder";
+ args[2] = "fetchTime,urlLength,score";
+ int result = ToolRunner.run(conf, new DeduplicationJob(), args);
+ Assert.assertEquals("DeduplicationJob did not succeed", 0, result);
+ String url1 = "http://nutch.apache.org/";
+ String url2 = "https://nutch.apache.org/";
+ // url1 has been fetched earlier, so it should "survive" as "db_fetched":
+ checkStatus(url1, CrawlDatum.STATUS_DB_FETCHED);
+ checkStatus(url2, CrawlDatum.STATUS_DB_DUPLICATE);
+ }
+
+ @Test
+ public void testDeduplicationHttpsOverHttp() throws Exception {
+ String[] args = new String[3];
+ args[0] = testCrawlDb.toString();
+ args[1] = "-compareOrder";
+ args[2] = "httpsOverHttp,fetchTime,urlLength,score";
+ int result = ToolRunner.run(conf, new DeduplicationJob(), args);
+ Assert.assertEquals("DeduplicationJob did not succeed", 0, result);
+ String url1 = "http://nutch.apache.org/";
+ String url2 = "https://nutch.apache.org/";
+ // url2 is https://, so it should "survive" as "db_fetched":
+ checkStatus(url1, CrawlDatum.STATUS_DB_DUPLICATE);
+ checkStatus(url2, CrawlDatum.STATUS_DB_FETCHED);
+ }
+
+ private void checkStatus(String url, byte status) throws IOException {
+ CrawlDatum datum = reader.get(testCrawlDb.toString(), url, conf);
+ Assert.assertNotNull("No CrawlDatum found in CrawlDb for " + url, datum);
+ Assert.assertEquals(
+ "Expected status for " + url + ": " + CrawlDatum.getStatusName(status),
+ status, datum.getStatus());
+ }
+
+ static class TestDedupReducer extends DeduplicationJob.DedupReducer<Text> {
+
+ void setCompareOrder(String compareOrder) {
+ this.compareOrder = compareOrder.split(",");
+ }
+
+ String getDuplicate(String one, String two) {
+ CrawlDatum d1 = new CrawlDatum();
+ d1.getMetaData().put(DeduplicationJob.urlKey, new Text(one));
+ CrawlDatum d2 = new CrawlDatum();
+ d2.getMetaData().put(DeduplicationJob.urlKey, new Text(two));
+ CrawlDatum dup = getDuplicate(d1, d2);
+ if (dup == null) {
+ return null;
+ }
+ return dup.getMetaData().get(DeduplicationJob.urlKey).toString();
+ }
+ }
+
+ public String getDuplicateURL(String compareOrder, String url1, String url2) {
+ TestDedupReducer dedup = new TestDedupReducer();
+ dedup.setCompareOrder(compareOrder);
+ return dedup.getDuplicate(url1, url2);
+ }
+
+ @Test
+ public void testCompareURLs() {
+ // test same protocol, same length: no decision possible
+ String url0 = "https://example.com/";
+ Assert.assertNull(getDuplicateURL("httpsOverHttp,urlLength", url0, url0));
+ String url1 = "http://nutch.apache.org/";
+ String url2 = "https://nutch.apache.org/";
+ // test httpsOverHttp
+ Assert.assertEquals(url1, getDuplicateURL("httpsOverHttp", url1, url2));
+ // test urlLength
+ Assert.assertEquals(url2, getDuplicateURL("urlLength", url1, url2));
+ // test urlLength with percent-encoded URLs
+ // "b%C3%BCcher" (unescaped "bücher") is shorter than "buecher"
+ String url3 = "https://example.com/b%C3%BCcher";
+ String url4 = "https://example.com/buecher";
+ Assert.assertEquals(url4, getDuplicateURL("urlLength", url3, url4));
+ // test NUTCH-2935: should not throw error on invalid percent-encoding
+ String url5 = "https://example.com/%YR";
+ String url6 = "https://example.com/%YR%YR";
+ Assert.assertEquals(url6, getDuplicateURL("urlLength", url5, url6));
+ }
+
+}
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc b/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc
new file mode 100644
index 0000000..d43f0ac
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc b/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc
new file mode 100644
index 0000000..9303568
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/data b/src/testresources/deduplication-crawldb/current/part-r-00000/data
new file mode 100644
index 0000000..640a0d5
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/data differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/index b/src/testresources/deduplication-crawldb/current/part-r-00000/index
new file mode 100644
index 0000000..c1be1ec
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/index differ
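The new TestCrawlDbDeduplication covers both paths: it runs the full job against the bundled sample CrawlDb under src/testresources/deduplication-crawldb and compares URL pairs directly through the now-protected DedupReducer.getDuplicate(). The same compare order used by the tests can be passed on the command line (assuming the standard bin/nutch launcher; the CrawlDb path is a placeholder):

    bin/nutch dedup crawl/crawldb -compareOrder fetchTime,urlLength,score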