Posted to commits@nutch.apache.org by sn...@apache.org on 2022/01/17 18:57:03 UTC
[nutch] branch master updated: NUTCH-2935 DeduplicationJob: failure on URLs with invalid percent encoding - catch IllegalArgumentException when unescaping percent-encoding in URLs - if one URL of two compared URLs is valid, keep it as non-duplicate - add unit tests for DeduplicationJob
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new d565f45 NUTCH-2935 DeduplicationJob: failure on URLs with invalid percent encoding - catch IllegalArgumentException when unescaping percent-encoding in URLs - if one URL of two compared URLs is valid, keep it as non-duplicate - add unit tests for DeduplicationJob
d565f45 is described below
commit d565f45a67d2491b7b536ae95560522aa20b8c26
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Fri Jan 14 10:34:22 2022 +0100
NUTCH-2935 DeduplicationJob: failure on URLs with invalid percent encoding
- catch IllegalArgumentException when unescaping percent-encoding in URLs
- if one URL of two compared URLs is valid, keep it as non-duplicate
- add unit tests for DeduplicationJob
---
.../org/apache/nutch/crawl/DeduplicationJob.java | 60 ++++----
.../nutch/crawl/TestCrawlDbDeduplication.java | 163 +++++++++++++++++++++
.../current/part-r-00000/.data.crc | Bin 0 -> 32 bytes
.../current/part-r-00000/.index.crc | Bin 0 -> 12 bytes
.../current/part-r-00000/data | Bin 0 -> 2604 bytes
.../current/part-r-00000/index | Bin 0 -> 233 bytes
6 files changed, 196 insertions(+), 27 deletions(-)
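The heart of the change is how percent-encoded URLs are handled in the urlLength comparison: instead of aborting the reduce task when URLDecoder throws, each URL is now decoded independently and falls back to its raw (encoded) form on failure. A minimal standalone sketch of that pattern outside the MapReduce context follows; the helper name decodeOrKeep is illustrative only and not part of the commit.

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;
    import java.nio.charset.StandardCharsets;

    public class PercentDecodeSketch {

      /** Decode percent-encoding; on invalid sequences keep the URL as-is. */
      static String decodeOrKeep(String url) {
        try {
          return URLDecoder.decode(url, StandardCharsets.UTF_8.toString());
        } catch (UnsupportedEncodingException | IllegalArgumentException e) {
          // e.g. "%YR" is not a valid percent-encoded sequence: keep the
          // encoded form so the URL can still be compared by length
          return url;
        }
      }

      public static void main(String[] args) {
        System.out.println(decodeOrKeep("https://example.com/b%C3%BCcher")); // decoded to ".../bücher"
        System.out.println(decodeOrKeep("https://example.com/%YR"));         // kept unchanged
      }
    }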
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 7751366..5f1172d 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.HashMap;
import java.util.Map;
@@ -30,6 +31,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
@@ -59,15 +61,16 @@ import org.slf4j.LoggerFactory;
* with the latest timestamp is kept. If the documents have the same timestamp
* then the one with the shortest URL is kept. The documents marked as duplicate
* can then be deleted with the command CleaningJob.
- ***/
+ */
public class DeduplicationJob extends NutchTool implements Tool {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private final static Text urlKey = new Text("_URLTEMPKEY_");
- private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
- private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
+ protected final static Text urlKey = new Text("_URLTEMPKEY_");
+ protected final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
+ protected final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
+ protected final static String UTF_8 = StandardCharsets.UTF_8.toString();
public static class DBFilter extends
Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
@@ -76,13 +79,12 @@ public class DeduplicationJob extends NutchTool implements Tool {
@Override
public void setup(Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum>.Context context) {
- Configuration arg0 = context.getConfiguration();
- groupMode = arg0.get(DEDUPLICATION_GROUP_MODE);
+ Configuration conf = context.getConfiguration();
+ groupMode = conf.get(DEDUPLICATION_GROUP_MODE);
}
@Override
- public void map(Text key, CrawlDatum value,
- Context context)
+ public void map(Text key, CrawlDatum value, Context context)
throws IOException, InterruptedException {
if (value.getStatus() == CrawlDatum.STATUS_DB_FETCHED
@@ -121,18 +123,19 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
}
- public static class DedupReducer extends
- Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> {
+ public static class DedupReducer<K extends Writable>
+ extends Reducer<K, CrawlDatum, Text, CrawlDatum> {
- private String[] compareOrder;
+ protected String[] compareOrder;
@Override
- public void setup(Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum>.Context context) {
+ public void setup(
+ Reducer<K, CrawlDatum, Text, CrawlDatum>.Context context) {
Configuration conf = context.getConfiguration();
compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(",");
}
- private void writeOutAsDuplicate(CrawlDatum datum,
+ protected void writeOutAsDuplicate(CrawlDatum datum,
Context context)
throws IOException, InterruptedException {
datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
@@ -143,8 +146,8 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
@Override
- public void reduce(BytesWritable key, Iterable<CrawlDatum> values,
- Context context) throws IOException, InterruptedException {
+ public void reduce(K key, Iterable<CrawlDatum> values, Context context)
+ throws IOException, InterruptedException {
CrawlDatum existingDoc = null;
for (CrawlDatum newDoc : values) {
@@ -164,8 +167,7 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
}
- private CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc)
- throws IOException {
+ protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
for (int i = 0; i < compareOrder.length; i++) {
switch (compareOrder[i]) {
case "score":
@@ -203,17 +205,21 @@ public class DeduplicationJob extends NutchTool implements Tool {
}
break;
case "urlLength":
- // same time? keep the one which has the shortest URL
- String urlExisting;
- String urlnewDoc;
+ // keep the one which has the shortest URL
+ // normalized by decoding percent-encoded sequences
+ String urlExisting = existingDoc.getMetaData().get(urlKey).toString();
+ String urlnewDoc = newDoc.getMetaData().get(urlKey).toString();
+ try {
+ urlExisting = URLDecoder.decode(urlExisting, UTF_8);
+ } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ LOG.error("Error decoding: {}", urlExisting, e);
+ // use the encoded URL
+ }
try {
- urlExisting = URLDecoder.decode(
- existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
- urlnewDoc = URLDecoder
- .decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
- } catch (UnsupportedEncodingException e) {
- LOG.error("Error decoding: " + urlKey);
- throw new IOException("UnsupportedEncodingException for " + urlKey);
+ urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
+ } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ LOG.error("Error decoding: {}", urlnewDoc, e);
+ // use the encoded URL
}
if (urlExisting.length() < urlnewDoc.length()) {
// mark new one as duplicate
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java b/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java
new file mode 100644
index 0000000..25850a9
--- /dev/null
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbDeduplication.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestCrawlDbDeduplication {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ Configuration conf;
+ FileSystem fs;
+ String testDir;
+ Path testCrawlDb;
+ CrawlDbReader reader;
+
+ @Before
+ public void setUp() throws Exception {
+ conf = NutchConfiguration.create();
+ fs = FileSystem.get(conf);
+ testDir = "test-crawldb-" + new java.util.Random().nextInt();
+ File sampleCrawlDb = new File(System.getProperty("test.build.data", "."),
+ "deduplication-crawldb");
+ LOG.info("Copying CrawlDb {} into test directory {}", sampleCrawlDb,
+ testDir);
+ FileUtils.copyDirectory(sampleCrawlDb, new File(testDir));
+ testCrawlDb = new Path(testDir);
+ for (FileStatus s : fs.listStatus(testCrawlDb)) {
+ LOG.info("{}", s);
+ }
+ reader = new CrawlDbReader();
+ }
+
+ @After
+ public void tearDown() {
+ try {
+ if (fs.exists(testCrawlDb))
+ fs.delete(testCrawlDb, true);
+ } catch (Exception e) {
+ }
+ try {
+ reader.close();
+ } catch (Exception e) {
+ }
+ }
+
+ @Test
+ public void testDeduplication() throws Exception {
+ String[] args = new String[3];
+ args[0] = testCrawlDb.toString();
+ args[1] = "-compareOrder";
+ args[2] = "fetchTime,urlLength,score";
+ int result = ToolRunner.run(conf, new DeduplicationJob(), args);
+ Assert.assertEquals("DeduplicationJob did not succeed", 0, result);
+ String url1 = "http://nutch.apache.org/";
+ String url2 = "https://nutch.apache.org/";
+ // url1 has been fetched earlier, so it should "survive" as "db_fetched":
+ checkStatus(url1, CrawlDatum.STATUS_DB_FETCHED);
+ checkStatus(url2, CrawlDatum.STATUS_DB_DUPLICATE);
+ }
+
+ @Test
+ public void testDeduplicationHttpsOverHttp() throws Exception {
+ String[] args = new String[3];
+ args[0] = testCrawlDb.toString();
+ args[1] = "-compareOrder";
+ args[2] = "httpsOverHttp,fetchTime,urlLength,score";
+ int result = ToolRunner.run(conf, new DeduplicationJob(), args);
+ Assert.assertEquals("DeduplicationJob did not succeed", 0, result);
+ String url1 = "http://nutch.apache.org/";
+ String url2 = "https://nutch.apache.org/";
+ // url2 is https://, so it should "survive" as "db_fetched":
+ checkStatus(url1, CrawlDatum.STATUS_DB_DUPLICATE);
+ checkStatus(url2, CrawlDatum.STATUS_DB_FETCHED);
+ }
+
+ private void checkStatus(String url, byte status) throws IOException {
+ CrawlDatum datum = reader.get(testCrawlDb.toString(), url, conf);
+ Assert.assertNotNull("No CrawlDatum found in CrawlDb for " + url, datum);
+ Assert.assertEquals(
+ "Expected status for " + url + ": " + CrawlDatum.getStatusName(status),
+ status, datum.getStatus());
+ }
+
+ static class TestDedupReducer extends DeduplicationJob.DedupReducer<Text> {
+
+ void setCompareOrder(String compareOrder) {
+ this.compareOrder = compareOrder.split(",");
+ }
+
+ String getDuplicate(String one, String two) {
+ CrawlDatum d1 = new CrawlDatum();
+ d1.getMetaData().put(DeduplicationJob.urlKey, new Text(one));
+ CrawlDatum d2 = new CrawlDatum();
+ d2.getMetaData().put(DeduplicationJob.urlKey, new Text(two));
+ CrawlDatum dup = getDuplicate(d1, d2);
+ if (dup == null) {
+ return null;
+ }
+ return dup.getMetaData().get(DeduplicationJob.urlKey).toString();
+ }
+ }
+
+ public String getDuplicateURL(String compareOrder, String url1, String url2) {
+ TestDedupReducer dedup = new TestDedupReducer();
+ dedup.setCompareOrder(compareOrder);
+ return dedup.getDuplicate(url1, url2);
+ }
+
+ @Test
+ public void testCompareURLs() {
+ // test same protocol, same length: no decision possible
+ String url0 = "https://example.com/";
+ Assert.assertNull(getDuplicateURL("httpsOverHttp,urlLength", url0, url0));
+ String url1 = "http://nutch.apache.org/";
+ String url2 = "https://nutch.apache.org/";
+ // test httpsOverHttp
+ Assert.assertEquals(url1, getDuplicateURL("httpsOverHttp", url1, url2));
+ // test urlLength
+ Assert.assertEquals(url2, getDuplicateURL("urlLength", url1, url2));
+ // test urlLength with percent-encoded URLs
+ // "b%C3%BCcher" (unescaped "bücher") is shorter than "buecher"
+ String url3 = "https://example.com/b%C3%BCcher";
+ String url4 = "https://example.com/buecher";
+ Assert.assertEquals(url4, getDuplicateURL("urlLength", url3, url4));
+ // test NUTCH-2935: should not throw error on invalid percent-encoding
+ String url5 = "https://example.com/%YR";
+ String url6 = "https://example.com/%YR%YR";
+ Assert.assertEquals(url6, getDuplicateURL("urlLength", url5, url6));
+ }
+
+}
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc b/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc
new file mode 100644
index 0000000..d43f0ac
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/.data.crc differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc b/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc
new file mode 100644
index 0000000..9303568
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/.index.crc differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/data b/src/testresources/deduplication-crawldb/current/part-r-00000/data
new file mode 100644
index 0000000..640a0d5
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/data differ
diff --git a/src/testresources/deduplication-crawldb/current/part-r-00000/index b/src/testresources/deduplication-crawldb/current/part-r-00000/index
new file mode 100644
index 0000000..c1be1ec
Binary files /dev/null and b/src/testresources/deduplication-crawldb/current/part-r-00000/index differ
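The new TestCrawlDbDeduplication covers both paths: it runs the full job against the bundled sample CrawlDb under src/testresources/deduplication-crawldb and compares URL pairs directly through the now-protected DedupReducer.getDuplicate(). The same compare order used by the tests can be passed on the command line (assuming the standard bin/nutch launcher; the CrawlDb path is a placeholder):

    bin/nutch dedup crawl/crawldb -compareOrder fetchTime,urlLength,score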