You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:37 UTC
[53/69] [abbrv] nutch git commit: Moved test sources to maven
standard directory
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
deleted file mode 100644
index b86181e..0000000
--- a/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.regex;
-
-// JDK imports
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.nutch.net.*;
-// Nutch imports
-import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * JUnit based test of class <code>RegexURLFilter</code>.
- *
- * @author Jérôme Charron
- */
-public class TestRegexURLFilter extends RegexURLFilterBaseTest {
-
- protected URLFilter getURLFilter(Reader rules) {
- try {
- return new RegexURLFilter(rules);
- } catch (IOException e) {
- Assert.fail(e.toString());
- return null;
- }
- }
-
- @Test
- public void test() {
- test("WholeWebCrawling");
- test("IntranetCrawling");
- bench(50, "Benchmarks");
- bench(100, "Benchmarks");
- bench(200, "Benchmarks");
- bench(400, "Benchmarks");
- bench(800, "Benchmarks");
- }
-
- @Test
- public void test1838() {
- test("nutch1838");
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-suffix/src/test/java/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/test/java/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/test/java/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
new file mode 100644
index 0000000..b09ca2f
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/src/test/java/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.suffix;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit test for <code>SuffixURLFilter</code>.
+ *
+ * @author Andrzej Bialecki
+ */
+public class TestSuffixURLFilter {
+ private static final String suffixes = "# this is a comment\n" + "\n"
+ + ".gif\n" + ".jpg\n" + ".js\n";
+
+ private static final String[] urls = new String[] {
+ "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF",
+ "http://www.example.com/test.jpg", "http://www.example.com/test.JPG",
+ "http://www.example.com/test.html", "http://www.example.com/test.HTML",
+ "http://www.example.com/test.html?q=abc.js",
+ "http://www.example.com/test.js?foo=bar&baz=bar#12333", };
+
+ private static String[] urlsModeAccept = new String[] { null, urls[1], null,
+ urls[3], urls[4], urls[5], null, urls[7] };
+
+ private static String[] urlsModeReject = new String[] { urls[0], null,
+ urls[2], null, null, null, urls[6], null };
+
+ private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null,
+ null, null, urls[4], urls[5], null, urls[7] };
+
+ private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0],
+ urls[1], urls[2], urls[3], null, null, urls[6], null };
+
+ private static String[] urlsModeAcceptAndPathFilter = new String[] { null,
+ urls[1], null, urls[3], urls[4], urls[5], urls[6], null };
+
+ private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null,
+ urls[1], null, urls[3], urls[4], urls[5], null, urls[7] };
+
+ private SuffixURLFilter filter = null;
+
+ @Before
+ public void setUp() throws IOException {
+ filter = new SuffixURLFilter(new StringReader(suffixes));
+ }
+
+ @Test
+ public void testModeAccept() {
+ filter.setIgnoreCase(false);
+ filter.setModeAccept(true);
+ for (int i = 0; i < urls.length; i++) {
+ Assert.assertTrue(urlsModeAccept[i] == filter.filter(urls[i]));
+ }
+ }
+
+ @Test
+ public void testModeReject() {
+ filter.setIgnoreCase(false);
+ filter.setModeAccept(false);
+ for (int i = 0; i < urls.length; i++) {
+ Assert.assertTrue(urlsModeReject[i] == filter.filter(urls[i]));
+ }
+ }
+
+ @Test
+ public void testModeAcceptIgnoreCase() {
+ filter.setIgnoreCase(true);
+ filter.setModeAccept(true);
+ for (int i = 0; i < urls.length; i++) {
+ Assert.assertTrue(urlsModeAcceptIgnoreCase[i] == filter.filter(urls[i]));
+ }
+ }
+
+ @Test
+ public void testModeRejectIgnoreCase() {
+ filter.setIgnoreCase(true);
+ filter.setModeAccept(false);
+ for (int i = 0; i < urls.length; i++) {
+ Assert.assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i]));
+ }
+ }
+
+ @Test
+ public void testModeAcceptAndNonPathFilter() {
+ filter.setModeAccept(true);
+ filter.setFilterFromPath(false);
+ for (int i = 0; i < urls.length; i++) {
+ Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter
+ .filter(urls[i]));
+ }
+ }
+
+ @Test
+ public void testModeAcceptAndPathFilter() {
+ filter.setModeAccept(true);
+ filter.setFilterFromPath(true);
+ for (int i = 0; i < urls.length; i++) {
+ Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter
+ .filter(urls[i]));
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
deleted file mode 100644
index b09ca2f..0000000
--- a/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.suffix;
-
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * JUnit test for <code>SuffixURLFilter</code>.
- *
- * @author Andrzej Bialecki
- */
-public class TestSuffixURLFilter {
- private static final String suffixes = "# this is a comment\n" + "\n"
- + ".gif\n" + ".jpg\n" + ".js\n";
-
- private static final String[] urls = new String[] {
- "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF",
- "http://www.example.com/test.jpg", "http://www.example.com/test.JPG",
- "http://www.example.com/test.html", "http://www.example.com/test.HTML",
- "http://www.example.com/test.html?q=abc.js",
- "http://www.example.com/test.js?foo=bar&baz=bar#12333", };
-
- private static String[] urlsModeAccept = new String[] { null, urls[1], null,
- urls[3], urls[4], urls[5], null, urls[7] };
-
- private static String[] urlsModeReject = new String[] { urls[0], null,
- urls[2], null, null, null, urls[6], null };
-
- private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null,
- null, null, urls[4], urls[5], null, urls[7] };
-
- private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0],
- urls[1], urls[2], urls[3], null, null, urls[6], null };
-
- private static String[] urlsModeAcceptAndPathFilter = new String[] { null,
- urls[1], null, urls[3], urls[4], urls[5], urls[6], null };
-
- private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null,
- urls[1], null, urls[3], urls[4], urls[5], null, urls[7] };
-
- private SuffixURLFilter filter = null;
-
- @Before
- public void setUp() throws IOException {
- filter = new SuffixURLFilter(new StringReader(suffixes));
- }
-
- @Test
- public void testModeAccept() {
- filter.setIgnoreCase(false);
- filter.setModeAccept(true);
- for (int i = 0; i < urls.length; i++) {
- Assert.assertTrue(urlsModeAccept[i] == filter.filter(urls[i]));
- }
- }
-
- @Test
- public void testModeReject() {
- filter.setIgnoreCase(false);
- filter.setModeAccept(false);
- for (int i = 0; i < urls.length; i++) {
- Assert.assertTrue(urlsModeReject[i] == filter.filter(urls[i]));
- }
- }
-
- @Test
- public void testModeAcceptIgnoreCase() {
- filter.setIgnoreCase(true);
- filter.setModeAccept(true);
- for (int i = 0; i < urls.length; i++) {
- Assert.assertTrue(urlsModeAcceptIgnoreCase[i] == filter.filter(urls[i]));
- }
- }
-
- @Test
- public void testModeRejectIgnoreCase() {
- filter.setIgnoreCase(true);
- filter.setModeAccept(false);
- for (int i = 0; i < urls.length; i++) {
- Assert.assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i]));
- }
- }
-
- @Test
- public void testModeAcceptAndNonPathFilter() {
- filter.setModeAccept(true);
- filter.setFilterFromPath(false);
- for (int i = 0; i < urls.length; i++) {
- Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter
- .filter(urls[i]));
- }
- }
-
- @Test
- public void testModeAcceptAndPathFilter() {
- filter.setModeAccept(true);
- filter.setFilterFromPath(true);
- for (int i = 0; i < urls.length; i++) {
- Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter
- .filter(urls[i]));
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-validator/src/test/java/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/src/test/java/org/apache/nutch/urlfilter/validator/TestUrlValidator.java b/nutch-plugins/urlfilter-validator/src/test/java/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
new file mode 100644
index 0000000..2e6d695
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/src/test/java/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.validator;
+
+import org.apache.nutch.urlfilter.validator.UrlValidator;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests 1. that valid urls are not filtered while invalid
+ * ones are filtered. 2. that Urls' scheme, authority, path and query are
+ * validated.
+ *
+ * @author tejasp
+ *
+ */
+
+public class TestUrlValidator {
+
+ /**
+ * Test method for
+ * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}
+ * .
+ */
+ @Test
+ public void testFilter() {
+ UrlValidator url_validator = new UrlValidator();
+ Assert.assertNotNull(url_validator);
+
+ Assert.assertNull("Filtering on a null object should return null",
+ url_validator.filter(null));
+ Assert.assertNull("Invalid url: example.com/file[/].html",
+ url_validator.filter("example.com/file[/].html"));
+ Assert.assertNull("Invalid url: http://www.example.com/space here.html",
+ url_validator.filter("http://www.example.com/space here.html"));
+ Assert.assertNull("Invalid url: /main.html",
+ url_validator.filter("/main.html"));
+ Assert.assertNull("Invalid url: www.example.com/main.html",
+ url_validator.filter("www.example.com/main.html"));
+ Assert.assertNull("Invalid url: ftp:www.example.com/main.html",
+ url_validator.filter("ftp:www.example.com/main.html"));
+ Assert.assertNull(
+ "Inalid url: http://999.000.456.32/nutch/trunk/README.txt",
+ url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt"));
+ Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html",
+ url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
+
+ Assert.assertNotNull(
+ "Valid url: https://issues.apache.org/jira/NUTCH-1127",
+ url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
+ Assert
+ .assertNotNull(
+ "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather",
+ url_validator
+ .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather"));
+ Assert
+ .assertNotNull(
+ "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
+ url_validator
+ .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
+ Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf",
+ url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java b/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
deleted file mode 100644
index 2e6d695..0000000
--- a/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.validator;
-
-import org.apache.nutch.urlfilter.validator.UrlValidator;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * JUnit test case which tests 1. that valid urls are not filtered while invalid
- * ones are filtered. 2. that Urls' scheme, authority, path and query are
- * validated.
- *
- * @author tejasp
- *
- */
-
-public class TestUrlValidator {
-
- /**
- * Test method for
- * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}
- * .
- */
- @Test
- public void testFilter() {
- UrlValidator url_validator = new UrlValidator();
- Assert.assertNotNull(url_validator);
-
- Assert.assertNull("Filtering on a null object should return null",
- url_validator.filter(null));
- Assert.assertNull("Invalid url: example.com/file[/].html",
- url_validator.filter("example.com/file[/].html"));
- Assert.assertNull("Invalid url: http://www.example.com/space here.html",
- url_validator.filter("http://www.example.com/space here.html"));
- Assert.assertNull("Invalid url: /main.html",
- url_validator.filter("/main.html"));
- Assert.assertNull("Invalid url: www.example.com/main.html",
- url_validator.filter("www.example.com/main.html"));
- Assert.assertNull("Invalid url: ftp:www.example.com/main.html",
- url_validator.filter("ftp:www.example.com/main.html"));
- Assert.assertNull(
- "Inalid url: http://999.000.456.32/nutch/trunk/README.txt",
- url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt"));
- Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html",
- url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
-
- Assert.assertNotNull(
- "Valid url: https://issues.apache.org/jira/NUTCH-1127",
- url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
- Assert
- .assertNotNull(
- "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather",
- url_validator
- .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather"));
- Assert
- .assertNotNull(
- "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
- url_validator
- .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
- Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf",
- url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
-
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-ajax/src/test/java/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/src/test/java/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/test/java/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
new file mode 100644
index 0000000..d815c45
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/src/test/java/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.ajax;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/** Unit tests for AjaxURLNormalizer. */
+public class TestAjaxURLNormalizer extends TestCase {
+ private AjaxURLNormalizer normalizer;
+ private Configuration conf;
+
+ public TestAjaxURLNormalizer(String name) {
+ super(name);
+ normalizer = new AjaxURLNormalizer();
+ conf = NutchConfiguration.create();
+ normalizer.setConf(conf);
+ }
+
+ public void testNormalizer() throws Exception {
+ // check if AJAX URL's are normalized to an _escaped_frament_ form
+ normalizeTest("http://example.org/#!k=v", "http://example.org/?_escaped_fragment_=k=v");
+
+ // Check with some escaped chars
+ normalizeTest("http://example.org/#!k=v&something=is wrong", "http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong");
+
+ // Check with query string and multiple fragment params
+ normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2", "http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2");
+ }
+
+ public void testNormalizerWhenIndexing() throws Exception {
+ // check if it works the other way around
+ normalizeTest("http://example.org/?_escaped_fragment_=key=value", "http://example.org/#!key=value", URLNormalizers.SCOPE_INDEXER);
+ normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value", "http://example.org/?key=value#!key=value", URLNormalizers.SCOPE_INDEXER);
+ normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong", "http://example.org/page.html?key=value#!key=value&something=is wrong", URLNormalizers.SCOPE_INDEXER);
+ }
+
+ private void normalizeTest(String weird, String normal) throws Exception {
+ assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
+ }
+
+ private void normalizeTest(String weird, String normal, String scope) throws Exception {
+ assertEquals(normal, normalizer.normalize(weird, scope));
+ }
+
+ public static void main(String[] args) throws Exception {
+ new TestAjaxURLNormalizer("test").testNormalizer();
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
deleted file mode 100644
index d815c45..0000000
--- a/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.net.urlnormalizer.ajax;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-/** Unit tests for AjaxURLNormalizer. */
-public class TestAjaxURLNormalizer extends TestCase {
- private AjaxURLNormalizer normalizer;
- private Configuration conf;
-
- public TestAjaxURLNormalizer(String name) {
- super(name);
- normalizer = new AjaxURLNormalizer();
- conf = NutchConfiguration.create();
- normalizer.setConf(conf);
- }
-
- public void testNormalizer() throws Exception {
- // check if AJAX URL's are normalized to an _escaped_frament_ form
- normalizeTest("http://example.org/#!k=v", "http://example.org/?_escaped_fragment_=k=v");
-
- // Check with some escaped chars
- normalizeTest("http://example.org/#!k=v&something=is wrong", "http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong");
-
- // Check with query string and multiple fragment params
- normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2", "http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2");
- }
-
- public void testNormalizerWhenIndexing() throws Exception {
- // check if it works the other way around
- normalizeTest("http://example.org/?_escaped_fragment_=key=value", "http://example.org/#!key=value", URLNormalizers.SCOPE_INDEXER);
- normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value", "http://example.org/?key=value#!key=value", URLNormalizers.SCOPE_INDEXER);
- normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong", "http://example.org/page.html?key=value#!key=value&something=is wrong", URLNormalizers.SCOPE_INDEXER);
- }
-
- private void normalizeTest(String weird, String normal) throws Exception {
- assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
- }
-
- private void normalizeTest(String weird, String normal, String scope) throws Exception {
- assertEquals(normal, normalizer.normalize(weird, scope));
- }
-
- public static void main(String[] args) throws Exception {
- new TestAjaxURLNormalizer("test").testNormalizer();
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-basic/src/test/java/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/src/test/java/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/nutch-plugins/urlnormalizer-basic/src/test/java/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
new file mode 100644
index 0000000..9a0f8c4
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-basic/src/test/java/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for BasicURLNormalizer. */
+public class TestBasicURLNormalizer {
+ private BasicURLNormalizer normalizer;
+
+ private Configuration conf;
+
+ public TestBasicURLNormalizer() {
+ normalizer = new BasicURLNormalizer();
+ conf = NutchConfiguration.create();
+ normalizer.setConf(conf);
+ }
+
+ @Test
+ public void testNUTCH1098() throws Exception {
+ // check that % encoding is normalized
+ normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+
+ // check that % encoding works correctly at end of URL
+ normalizeTest("http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html");
+ normalizeTest("http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html");
+
+ // check that % decoder do not overlap strings
+ normalizeTest("http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html");
+
+ // check that % decoder leaves high bit chars alone
+ normalizeTest("http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0");
+
+ // check that % decoder leaves control chars alone
+ normalizeTest("http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A");
+
+ // check that % decoder converts to upper case letters
+ normalizeTest("http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0");
+
+ // check that % decoder leaves encoded spaces alone
+ normalizeTest("http://foo.com/you%20too.html", "http://foo.com/you%20too.html");
+
+ // check that spaces are encoded into %20
+ normalizeTest("http://foo.com/you too.html", "http://foo.com/you%20too.html");
+
+ // check that encoded # are not decoded
+ normalizeTest("http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz");
+
+ // check that encoded / are not decoded
+ normalizeTest("http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz");
+
+ // check that control chars are encoded
+ normalizeTest("http://foo.com/\u001a!", "http://foo.com/%1A!");
+
+ // check that control chars are always encoded into 2 digits
+ normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!");
+
+ // check encoding of spanish chars
+ normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx");
+ }
+
+ @Test
+ public void testNUTCH2064() throws Exception {
+ // Ampersand and colon and other punctuation characters are not to be unescaped
+ normalizeTest("http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10");
+ normalizeTest("http://x.com/show?http%3A%2F%2Fx.com%2Fb",
+ "http://x.com/show?http%3A%2F%2Fx.com%2Fb");
+ normalizeTest("http://google.com/search?q=c%2B%2B",
+ "http://google.com/search?q=c%2B%2B");
+ // do also not touch the query part which is application/x-www-form-urlencoded
+ normalizeTest("http://x.com/s?q=a+b", "http://x.com/s?q=a+b");
+ // and keep Internationalized domain names
+ // http://bücher.de/ may be http://xn--bcher-kva.de/
+ // but definitely not http://b%C3%BCcher.de/
+ normalizeTest("http://b\u00fccher.de/", "http://b\u00fccher.de/");
+ // test whether percent-encoding works together with other normalizations
+ normalizeTest("http://x.com/./a/../%66.html", "http://x.com/f.html");
+ // [ and ] need escaping as well
+ normalizeTest("http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1");
+ // boundary test for first character outside the ASCII range (U+0080)
+ normalizeTest("http://x.com/foo\u0080", "http://x.com/foo%C2%80");
+ normalizeTest("http://x.com/foo%c2%80", "http://x.com/foo%C2%80");
+ }
+
+ @Test
+ public void testNormalizer() throws Exception {
+ // check that leading and trailing spaces are removed
+ normalizeTest(" http://foo.com/ ", "http://foo.com/");
+
+ // check that protocol is lower cased
+ normalizeTest("HTTP://foo.com/", "http://foo.com/");
+
+ // check that host is lower cased
+ normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
+ normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
+
+ // check that port number is normalized
+ normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
+ normalizeTest("http://foo.com:81/", "http://foo.com:81/");
+
+ // check that null path is normalized
+ normalizeTest("http://foo.com", "http://foo.com/");
+
+ // check that references are removed
+ normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html");
+
+ // // check that encoding is normalized
+ // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+
+ // check that unnecessary "../" are removed
+
+ normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
+ normalizeTest("http://foo.com/aa/../", "http://foo.com/");
+ normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");
+ normalizeTest("http://foo.com/aa/..", "http://foo.com/");
+ normalizeTest("http://foo.com/aa/bb/cc/../../foo.html",
+ "http://foo.com/aa/foo.html");
+ normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
+ "http://foo.com/aa/cc/ee/foo.html");
+ normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html");
+ normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html");
+ normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html");
+ normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html");
+ normalizeTest("http://foo.com/aa/../bb/../foo.html/../../",
+ "http://foo.com/");
+ normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html");
+ normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html");
+ normalizeTest("http://foo.com/a..a/foo.html",
+ "http://foo.com/a..a/foo.html");
+ normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html");
+ normalizeTest("http://foo.com/foo.foo/../foo.html",
+ "http://foo.com/foo.html");
+ normalizeTest("http://foo.com//aa/bb/foo.html",
+ "http://foo.com/aa/bb/foo.html");
+ normalizeTest("http://foo.com/aa//bb/foo.html",
+ "http://foo.com/aa/bb/foo.html");
+ normalizeTest("http://foo.com/aa/bb//foo.html",
+ "http://foo.com/aa/bb/foo.html");
+ normalizeTest("http://foo.com//aa//bb//foo.html",
+ "http://foo.com/aa/bb/foo.html");
+ normalizeTest("http://foo.com////aa////bb////foo.html",
+ "http://foo.com/aa/bb/foo.html");
+ normalizeTest("http://foo.com/aa?referer=http://bar.com",
+ "http://foo.com/aa?referer=http://bar.com");
+ }
+
+ private void normalizeTest(String weird, String normal) throws Exception {
+ Assert.assertEquals("normalizing: " + weird, normal,
+ normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
+ }
+
+ public static void main(String[] args) throws Exception {
+ new TestBasicURLNormalizer().testNormalizer();
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
deleted file mode 100644
index 9a0f8c4..0000000
--- a/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ /dev/null
@@ -1,175 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.net.urlnormalizer.basic;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/** Unit tests for BasicURLNormalizer. */
-public class TestBasicURLNormalizer {
- private BasicURLNormalizer normalizer;
-
- private Configuration conf;
-
- public TestBasicURLNormalizer() {
- normalizer = new BasicURLNormalizer();
- conf = NutchConfiguration.create();
- normalizer.setConf(conf);
- }
-
- @Test
- public void testNUTCH1098() throws Exception {
- // check that % encoding is normalized
- normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
-
- // check that % encoding works correctly at end of URL
- normalizeTest("http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html");
- normalizeTest("http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html");
-
- // check that % decoder does not overlap strings
- normalizeTest("http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html");
-
- // check that % decoder leaves high bit chars alone
- normalizeTest("http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0");
-
- // check that % decoder leaves control chars alone
- normalizeTest("http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A");
-
- // check that % decoder converts to upper case letters
- normalizeTest("http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0");
-
- // check that % decoder leaves encoded spaces alone
- normalizeTest("http://foo.com/you%20too.html", "http://foo.com/you%20too.html");
-
- // check that spaces are encoded into %20
- normalizeTest("http://foo.com/you too.html", "http://foo.com/you%20too.html");
-
- // check that encoded # are not decoded
- normalizeTest("http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz");
-
- // check that encoded / are not decoded
- normalizeTest("http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz");
-
- // check that control chars are encoded
- normalizeTest("http://foo.com/\u001a!", "http://foo.com/%1A!");
-
- // check that control chars are always encoded into 2 digits
- normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!");
-
- // check encoding of spanish chars
- normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx");
- }
-
- @Test
- public void testNUTCH2064() throws Exception {
- // Ampersand and colon and other punctuation characters are not to be unescaped
- normalizeTest("http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10");
- normalizeTest("http://x.com/show?http%3A%2F%2Fx.com%2Fb",
- "http://x.com/show?http%3A%2F%2Fx.com%2Fb");
- normalizeTest("http://google.com/search?q=c%2B%2B",
- "http://google.com/search?q=c%2B%2B");
- // do also not touch the query part which is application/x-www-form-urlencoded
- normalizeTest("http://x.com/s?q=a+b", "http://x.com/s?q=a+b");
- // and keep Internationalized domain names
- // http://bücher.de/ may be http://xn--bcher-kva.de/
- // but definitely not http://b%C3%BCcher.de/
- normalizeTest("http://b\u00fccher.de/", "http://b\u00fccher.de/");
- // test whether percent-encoding works together with other normalizations
- normalizeTest("http://x.com/./a/../%66.html", "http://x.com/f.html");
- // [ and ] need escaping as well
- normalizeTest("http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1");
- // boundary test for first character outside the ASCII range (U+0080)
- normalizeTest("http://x.com/foo\u0080", "http://x.com/foo%C2%80");
- normalizeTest("http://x.com/foo%c2%80", "http://x.com/foo%C2%80");
- }
-
- @Test
- public void testNormalizer() throws Exception {
- // check that leading and trailing spaces are removed
- normalizeTest(" http://foo.com/ ", "http://foo.com/");
-
- // check that protocol is lower cased
- normalizeTest("HTTP://foo.com/", "http://foo.com/");
-
- // check that host is lower cased
- normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
- normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
-
- // check that port number is normalized
- normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
- normalizeTest("http://foo.com:81/", "http://foo.com:81/");
-
- // check that null path is normalized
- normalizeTest("http://foo.com", "http://foo.com/");
-
- // check that references are removed
- normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html");
-
- // // check that encoding is normalized
- // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
-
- // check that unnecessary "../" are removed
-
- normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
- normalizeTest("http://foo.com/aa/../", "http://foo.com/");
- normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");
- normalizeTest("http://foo.com/aa/..", "http://foo.com/");
- normalizeTest("http://foo.com/aa/bb/cc/../../foo.html",
- "http://foo.com/aa/foo.html");
- normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
- "http://foo.com/aa/cc/ee/foo.html");
- normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html");
- normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html");
- normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html");
- normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html");
- normalizeTest("http://foo.com/aa/../bb/../foo.html/../../",
- "http://foo.com/");
- normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html");
- normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html");
- normalizeTest("http://foo.com/a..a/foo.html",
- "http://foo.com/a..a/foo.html");
- normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html");
- normalizeTest("http://foo.com/foo.foo/../foo.html",
- "http://foo.com/foo.html");
- normalizeTest("http://foo.com//aa/bb/foo.html",
- "http://foo.com/aa/bb/foo.html");
- normalizeTest("http://foo.com/aa//bb/foo.html",
- "http://foo.com/aa/bb/foo.html");
- normalizeTest("http://foo.com/aa/bb//foo.html",
- "http://foo.com/aa/bb/foo.html");
- normalizeTest("http://foo.com//aa//bb//foo.html",
- "http://foo.com/aa/bb/foo.html");
- normalizeTest("http://foo.com////aa////bb////foo.html",
- "http://foo.com/aa/bb/foo.html");
- normalizeTest("http://foo.com/aa?referer=http://bar.com",
- "http://foo.com/aa?referer=http://bar.com");
- }
-
- private void normalizeTest(String weird, String normal) throws Exception {
- Assert.assertEquals("normalizing: " + weird, normal,
- normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
- }
-
- public static void main(String[] args) throws Exception {
- new TestBasicURLNormalizer().testNormalizer();
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-host/src/test/java/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/src/test/java/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java b/nutch-plugins/urlnormalizer-host/src/test/java/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
new file mode 100644
index 0000000..c9e1a2c
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/src/test/java/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.host;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestHostURLNormalizer {
+
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ @Test
+ public void testHostURLNormalizer() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+
+ String hostsFile = SAMPLES + SEPARATOR + "hosts.txt";
+ HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile);
+ normalizer.setConf(conf);
+
+ // Force www. sub domain when hitting link without sub domain
+ Assert.assertEquals("http://www.example.org/page.html",
+ normalizer.normalize("http://example.org/page.html",
+ URLNormalizers.SCOPE_DEFAULT));
+
+ // Force no sub domain to www. URL's
+ Assert.assertEquals("http://example.net/path/to/something.html", normalizer
+ .normalize("http://www.example.net/path/to/something.html",
+ URLNormalizers.SCOPE_DEFAULT));
+
+ // Force all sub domains to the bare domain (without www.)
+ Assert.assertEquals("http://example.com/?does=it&still=work", normalizer
+ .normalize("http://example.com/?does=it&still=work",
+ URLNormalizers.SCOPE_DEFAULT));
+ Assert.assertEquals("http://example.com/buh", normalizer.normalize(
+ "http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT));
+ Assert.assertEquals("http://example.com/blaat", normalizer.normalize(
+ "http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java b/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
deleted file mode 100644
index c9e1a2c..0000000
--- a/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.net.urlnormalizer.host;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestHostURLNormalizer {
-
- private final static String SEPARATOR = System.getProperty("file.separator");
- private final static String SAMPLES = System.getProperty("test.data", ".");
-
- @Test
- public void testHostURLNormalizer() throws Exception {
- Configuration conf = NutchConfiguration.create();
-
- String hostsFile = SAMPLES + SEPARATOR + "hosts.txt";
- HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile);
- normalizer.setConf(conf);
-
- // Force www. sub domain when hitting link without sub domain
- Assert.assertEquals("http://www.example.org/page.html",
- normalizer.normalize("http://example.org/page.html",
- URLNormalizers.SCOPE_DEFAULT));
-
- // Force no sub domain to www. URL's
- Assert.assertEquals("http://example.net/path/to/something.html", normalizer
- .normalize("http://www.example.net/path/to/something.html",
- URLNormalizers.SCOPE_DEFAULT));
-
- // Force all sub domains to the bare domain (without www.)
- Assert.assertEquals("http://example.com/?does=it&still=work", normalizer
- .normalize("http://example.com/?does=it&still=work",
- URLNormalizers.SCOPE_DEFAULT));
- Assert.assertEquals("http://example.com/buh", normalizer.normalize(
- "http://http.www.example.com/buh", URLNormalizers.SCOPE_DEFAULT));
- Assert.assertEquals("http://example.com/blaat", normalizer.normalize(
- "http://whatever.example.com/blaat", URLNormalizers.SCOPE_DEFAULT));
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-pass/src/test/java/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/src/test/java/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java b/nutch-plugins/urlnormalizer-pass/src/test/java/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
new file mode 100644
index 0000000..f470c62
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-pass/src/test/java/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.pass;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestPassURLNormalizer {
+
+ @Test
+ public void testPassURLNormalizer() {
+ Configuration conf = NutchConfiguration.create();
+
+ PassURLNormalizer normalizer = new PassURLNormalizer();
+ normalizer.setConf(conf);
+ String url = "http://www.example.com/test/..//";
+ String result = null;
+ try {
+ result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+ } catch (MalformedURLException mue) {
+ Assert.fail(mue.toString());
+ }
+
+ Assert.assertEquals(url, result);
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java b/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
deleted file mode 100644
index f470c62..0000000
--- a/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.net.urlnormalizer.pass;
-
-import java.net.MalformedURLException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestPassURLNormalizer {
-
- @Test
- public void testPassURLNormalizer() {
- Configuration conf = NutchConfiguration.create();
-
- PassURLNormalizer normalizer = new PassURLNormalizer();
- normalizer.setConf(conf);
- String url = "http://www.example.com/test/..//";
- String result = null;
- try {
- result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT);
- } catch (MalformedURLException mue) {
- Assert.fail(mue.toString());
- }
-
- Assert.assertEquals(url, result);
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-protocol/src/test/java/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/src/test/java/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/nutch-plugins/urlnormalizer-protocol/src/test/java/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
new file mode 100644
index 0000000..8880628
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/src/test/java/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.protocol;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestProtocolURLNormalizer extends TestCase {
+
+ private final static String SEPARATOR = System.getProperty("file.separator");
+ private final static String SAMPLES = System.getProperty("test.data", ".");
+
+ public void testProtocolURLNormalizer() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+
+ String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt";
+ ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile);
+ normalizer.setConf(conf);
+
+ // https to http (duplicates the pair below; an unchanged http:// input is not exercised)
+ assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
+ // https to http
+ assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
+ // no change
+ assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+
+ // http to https
+ assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
deleted file mode 100644
index 8880628..0000000
--- a/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.net.urlnormalizer.protocol;
-
-import java.net.MalformedURLException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-public class TestProtocolURLNormalizer extends TestCase {
-
- private final static String SEPARATOR = System.getProperty("file.separator");
- private final static String SAMPLES = System.getProperty("test.data", ".");
-
- public void testProtocolURLNormalizer() throws Exception {
- Configuration conf = NutchConfiguration.create();
-
- String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt";
- ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile);
- normalizer.setConf(conf);
-
- // https to http (duplicates the pair below; an unchanged http:// input is not exercised)
- assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
-
- // https to http
- assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
-
- // no change
- assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT));
-
- // http to https
- assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT));
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-querystring/src/test/java/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-querystring/src/test/java/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java b/nutch-plugins/urlnormalizer-querystring/src/test/java/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
new file mode 100644
index 0000000..b85c55d
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-querystring/src/test/java/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.querystring;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestQuerystringURLNormalizer extends TestCase {
+
+ public void testQuerystringURLNormalizer() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+
+ QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer();
+ normalizer.setConf(conf);
+
+ assertEquals("http://example.com/?a=b&c=d", normalizer.normalize(
+ "http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com/a/b/c", normalizer.normalize(
+ "http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com:1234/a/b/c", normalizer.normalize(
+ "http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize(
+ "http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref",
+ normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref",
+ URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize(
+ "http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT));
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java b/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
deleted file mode 100644
index b85c55d..0000000
--- a/nutch-plugins/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.net.urlnormalizer.querystring;
-
-import java.net.MalformedURLException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
-
-public class TestQuerystringURLNormalizer extends TestCase {
-
- public void testQuerystringURLNormalizer() throws Exception {
- Configuration conf = NutchConfiguration.create();
-
- QuerystringURLNormalizer normalizer = new QuerystringURLNormalizer();
- normalizer.setConf(conf);
-
- assertEquals("http://example.com/?a=b&c=d", normalizer.normalize(
- "http://example.com/?c=d&a=b", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.com/a/b/c", normalizer.normalize(
- "http://example.com/a/b/c", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.com:1234/a/b/c", normalizer.normalize(
- "http://example.com:1234/a/b/c", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.com:1234/a/b/c#ref", normalizer.normalize(
- "http://example.com:1234/a/b/c#ref", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.com:1234/a/b/c?a=b&c=d#ref",
- normalizer.normalize("http://example.com:1234/a/b/c?c=d&a=b#ref",
- URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.com/?a=b&a=c&c=d", normalizer.normalize(
- "http://example.com/?c=d&a=b&a=c", URLNormalizers.SCOPE_DEFAULT));
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-regex/src/test/java/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-regex/src/test/java/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/nutch-plugins/urlnormalizer-regex/src/test/java/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
new file mode 100644
index 0000000..cbf6c64
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-regex/src/test/java/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.regex;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.*;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Unit tests for {@link RegexURLNormalizer}.
+ *
+ * <p>
+ * For every <code>regex-normalize-&lt;scope&gt;.xml</code> rule file found in
+ * the sample directory, the rules are registered with the normalizer under
+ * <code>&lt;scope&gt;</code> and the expectations are read from the sibling
+ * <code>regex-normalize-&lt;scope&gt;.test</code> file. Each data line of a
+ * <code>.test</code> file holds an input URL and the expected normalized URL,
+ * separated by whitespace.
+ */
+public class TestRegexURLNormalizer {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestRegexURLNormalizer.class);
+
+  /** File name prefix shared by the rule files and the expectation files. */
+  private static final String SAMPLE_PREFIX = "regex-normalize-";
+
+  /** File name suffix of a rule file. */
+  private static final String CONFIG_SUFFIX = ".xml";
+
+  private final RegexURLNormalizer normalizer;
+  private final Configuration conf;
+  private final Map<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>();
+
+  // Directory holding the sample files: the "test.data" system property,
+  // falling back to the current directory.
+  // NOTE(review): the property was defined in ./src/plugin/build-plugin.xml and
+  // the samples were copied by ./src/plugin/urlnormalizer-regex/build.xml -
+  // confirm how both are provided after the move to the Maven layout.
+  private final String sampleDir = System.getProperty("test.data", ".");
+
+  /**
+   * Creates the normalizer and loads the rules and expectations of every
+   * scope found in the sample directory. A scope whose files cannot be loaded
+   * is skipped with a warning so that the remaining scopes are still tested.
+   *
+   * @throws IOException
+   *           if the sample directory cannot be listed
+   */
+  public TestRegexURLNormalizer() throws IOException {
+    normalizer = new RegexURLNormalizer();
+    conf = NutchConfiguration.create();
+    normalizer.setConf(conf);
+    File[] configs = new File(sampleDir).listFiles(new FileFilter() {
+      public boolean accept(File f) {
+        String name = f.getName();
+        return name.endsWith(CONFIG_SUFFIX) && name.startsWith(SAMPLE_PREFIX);
+      }
+    });
+    if (configs == null) {
+      // listFiles() returns null when the path is not a readable directory;
+      // fail with a message instead of a NullPointerException.
+      throw new IOException("Cannot list test data directory '" + sampleDir
+          + "'");
+    }
+    for (File config : configs) {
+      try {
+        // The scope is the part between the prefix and the ".xml" suffix.
+        String cname = config.getName();
+        cname = cname.substring(SAMPLE_PREFIX.length(),
+            cname.indexOf(CONFIG_SUFFIX));
+        FileReader reader = new FileReader(config);
+        try {
+          normalizer.setConfiguration(reader, cname);
+        } finally {
+          reader.close();
+        }
+        testData.put(cname, readTestFile(cname));
+      } catch (Exception e) {
+        LOG.warn("Could not load config from '" + config + "': "
+            + e.toString(), e);
+      }
+    }
+  }
+
+  /** Checks every expectation recorded for the default scope. */
+  @Test
+  public void testNormalizerDefault() throws Exception {
+    NormalizedURL[] urls = testData.get(URLNormalizers.SCOPE_DEFAULT);
+    Assert.assertNotNull("No test data loaded for scope '"
+        + URLNormalizers.SCOPE_DEFAULT + "' from '" + sampleDir + "'", urls);
+    normalizeTest(urls, URLNormalizers.SCOPE_DEFAULT);
+  }
+
+  /** Checks the expectations of every scope that was loaded. */
+  @Test
+  public void testNormalizerScope() throws Exception {
+    for (Map.Entry<String, NormalizedURL[]> entry : testData.entrySet()) {
+      normalizeTest(entry.getValue(), entry.getKey());
+    }
+  }
+
+  /**
+   * Normalizes each sample URL in the given scope and asserts that the result
+   * equals the expected URL.
+   */
+  private void normalizeTest(NormalizedURL[] urls, String scope)
+      throws Exception {
+    for (NormalizedURL sample : urls) {
+      String normalized = normalizer.normalize(sample.url, scope);
+      LOG.info("scope: " + scope + " url: " + sample.url + " | normalized: "
+          + normalized + " | expected: " + sample.expectedURL);
+      Assert.assertEquals(sample.expectedURL, normalized);
+    }
+  }
+
+  /**
+   * Runs the expectations of a scope <code>loops</code> times and logs the
+   * elapsed time. Does nothing when the scope has no test data.
+   */
+  private void bench(int loops, String scope) {
+    long start = System.currentTimeMillis();
+    try {
+      NormalizedURL[] expected = testData.get(scope);
+      if (expected == null) {
+        return;
+      }
+      for (int i = 0; i < loops; i++) {
+        normalizeTest(expected, scope);
+      }
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+    LOG.info("bench time (" + loops + ") "
+        + (System.currentTimeMillis() - start) + "ms");
+  }
+
+  /** One expectation: an input URL and the URL it must normalize to. */
+  private static class NormalizedURL {
+    final String url;
+    final String expectedURL;
+
+    /**
+     * Parses a data line of the form <code>&lt;url&gt; &lt;expected url&gt;</code>.
+     *
+     * @throws IllegalArgumentException
+     *           if the line holds fewer than two whitespace-separated fields
+     */
+    public NormalizedURL(String line) {
+      String[] fields = line.split("\\s+");
+      if (fields.length < 2) {
+        throw new IllegalArgumentException(
+            "Expected '<url> <expected url>' but got: '" + line + "'");
+      }
+      url = fields[0];
+      expectedURL = fields[1];
+    }
+  }
+
+  /**
+   * Reads the expectations of a scope from its <code>.test</code> file, which
+   * is decoded as UTF-8. Blank lines, lines starting with <code>#</code> and
+   * lines starting with a space are skipped.
+   */
+  private NormalizedURL[] readTestFile(String scope) throws IOException {
+    File f = new File(sampleDir, SAMPLE_PREFIX + scope + ".test");
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(f), "UTF-8"));
+    try {
+      List<NormalizedURL> list = new ArrayList<NormalizedURL>();
+      String line;
+      while ((line = in.readLine()) != null) {
+        if (line.trim().length() == 0 || line.startsWith("#")
+            || line.startsWith(" ")) {
+          continue;
+        }
+        list.add(new NormalizedURL(line));
+      }
+      return list.toArray(new NormalizedURL[list.size()]);
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Command line entry point:
+   * <code>TestRegexURLNormalizer [-bench &lt;iter&gt;] &lt;scope&gt;</code>.
+   * Falls back to the default scope when the requested scope has no test
+   * data, and exits with status -1 on invalid arguments or missing data.
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length == 0) {
+      System.err.println("TestRegexURLNormalizer [-bench <iter>] <scope>");
+      System.exit(-1);
+    }
+    boolean bench = false;
+    int iter = -1;
+    String scope = null;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-bench")) {
+        bench = true;
+        if (i + 1 >= args.length) {
+          System.err.println("Missing number of iterations after -bench.");
+          System.exit(-1);
+        }
+        try {
+          iter = Integer.parseInt(args[++i]);
+        } catch (NumberFormatException e) {
+          System.err.println("Invalid number of iterations: " + args[i]);
+          System.exit(-1);
+        }
+      } else {
+        scope = args[i];
+      }
+    }
+    if (scope == null) {
+      System.err.println("Missing required scope name.");
+      System.exit(-1);
+    }
+    if (bench && iter < 0) {
+      System.err.println("Invalid number of iterations: " + iter);
+      System.exit(-1);
+    }
+    TestRegexURLNormalizer test = new TestRegexURLNormalizer();
+    NormalizedURL[] urls = test.testData.get(scope);
+    if (urls == null) {
+      LOG.warn("Missing test data for scope '" + scope
+          + "', using default scope.");
+      scope = URLNormalizers.SCOPE_DEFAULT;
+      urls = test.testData.get(scope);
+    }
+    if (urls == null) {
+      // Not even the default scope could be loaded; nothing to run.
+      System.err.println("No test data available for scope '" + scope + "'.");
+      System.exit(-1);
+    }
+    if (bench) {
+      test.bench(iter, scope);
+    } else {
+      test.normalizeTest(urls, scope);
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
deleted file mode 100644
index cbf6c64..0000000
--- a/nutch-plugins/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
+++ /dev/null
@@ -1,186 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.net.urlnormalizer.regex;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileFilter;
-import java.io.FileInputStream;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.*;
-
-import org.junit.Assert;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.util.NutchConfiguration;
-
-/** Unit tests for RegexUrlNormalizer. */
-public class TestRegexURLNormalizer {
- private static final Logger LOG = LoggerFactory
- .getLogger(TestRegexURLNormalizer.class);
-
- private RegexURLNormalizer normalizer;
- private Configuration conf;
- private Map<String, NormalizedURL[]> testData = new HashMap<String, NormalizedURL[]>();
-
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
-
- // Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation.
-
- public TestRegexURLNormalizer() throws IOException {
- normalizer = new RegexURLNormalizer();
- conf = NutchConfiguration.create();
- normalizer.setConf(conf);
- File[] configs = new File(sampleDir).listFiles(new FileFilter() {
- public boolean accept(File f) {
- if (f.getName().endsWith(".xml")
- && f.getName().startsWith("regex-normalize-"))
- return true;
- return false;
- }
- });
- for (int i = 0; i < configs.length; i++) {
- try {
- FileReader reader = new FileReader(configs[i]);
- String cname = configs[i].getName();
- cname = cname.substring(16, cname.indexOf(".xml"));
- normalizer.setConfiguration(reader, cname);
- NormalizedURL[] urls = readTestFile(cname);
- testData.put(cname, urls);
- } catch (Exception e) {
- LOG.warn("Could load config from '" + configs[i] + "': " + e.toString());
- }
- }
- }
-
- @Test
- public void testNormalizerDefault() throws Exception {
- normalizeTest((NormalizedURL[]) testData.get(URLNormalizers.SCOPE_DEFAULT),
- URLNormalizers.SCOPE_DEFAULT);
- }
-
- @Test
- public void testNormalizerScope() throws Exception {
- Iterator<String> it = testData.keySet().iterator();
- while (it.hasNext()) {
- String scope = it.next();
- normalizeTest((NormalizedURL[]) testData.get(scope), scope);
- }
- }
-
- private void normalizeTest(NormalizedURL[] urls, String scope)
- throws Exception {
- for (int i = 0; i < urls.length; i++) {
- String url = urls[i].url;
- String normalized = normalizer.normalize(urls[i].url, scope);
- String expected = urls[i].expectedURL;
- LOG.info("scope: " + scope + " url: " + url + " | normalized: "
- + normalized + " | expected: " + expected);
- Assert.assertEquals(urls[i].expectedURL, normalized);
- }
- }
-
- private void bench(int loops, String scope) {
- long start = System.currentTimeMillis();
- try {
- NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope);
- if (expected == null)
- return;
- for (int i = 0; i < loops; i++) {
- normalizeTest(expected, scope);
- }
- } catch (Exception e) {
- Assert.fail(e.toString());
- }
- LOG.info("bench time (" + loops + ") "
- + (System.currentTimeMillis() - start) + "ms");
- }
-
- private static class NormalizedURL {
- String url;
- String expectedURL;
-
- public NormalizedURL(String line) {
- String[] fields = line.split("\\s+");
- url = fields[0];
- expectedURL = fields[1];
- }
- }
-
- private NormalizedURL[] readTestFile(String scope) throws IOException {
- File f = new File(sampleDir, "regex-normalize-" + scope + ".test");
- @SuppressWarnings("resource")
- BufferedReader in = new BufferedReader(new InputStreamReader(
- new FileInputStream(f), "UTF-8"));
- List<NormalizedURL> list = new ArrayList<NormalizedURL>();
- String line;
- while ((line = in.readLine()) != null) {
- if (line.trim().length() == 0 || line.startsWith("#")
- || line.startsWith(" "))
- continue;
- list.add(new NormalizedURL(line));
- }
- return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]);
- }
-
- public static void main(String[] args) throws Exception {
- if (args.length == 0) {
- System.err.println("TestRegexURLNormalizer [-bench <iter>] <scope>");
- System.exit(-1);
- }
- boolean bench = false;
- int iter = -1;
- String scope = null;
- for (int i = 0; i < args.length; i++) {
- if (args[i].equals("-bench")) {
- bench = true;
- iter = Integer.parseInt(args[++i]);
- } else
- scope = args[i];
- }
- if (scope == null) {
- System.err.println("Missing required scope name.");
- System.exit(-1);
- }
- if (bench && iter < 0) {
- System.err.println("Invalid number of iterations: " + iter);
- System.exit(-1);
- }
- TestRegexURLNormalizer test = new TestRegexURLNormalizer();
- NormalizedURL[] urls = (NormalizedURL[]) test.testData.get(scope);
- if (urls == null) {
- LOG.warn("Missing test data for scope '" + scope
- + "', using default scope.");
- scope = URLNormalizers.SCOPE_DEFAULT;
- urls = (NormalizedURL[]) test.testData.get(scope);
- }
- if (bench) {
- test.bench(iter, scope);
- } else {
- test.normalizeTest(urls, scope);
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/urlnormalizer-slash/src/test/java/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-slash/src/test/java/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java b/nutch-plugins/urlnormalizer-slash/src/test/java/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
new file mode 100644
index 0000000..c3585e4
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-slash/src/test/java/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.slash;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/**
+ * JUnit 3 test for {@link SlashURLNormalizer}, configured from the
+ * <code>slashes.txt</code> sample file in the "test.data" directory.
+ *
+ * <p>
+ * In the expectations below, URLs on www.example.org gain a trailing slash,
+ * URLs on www.example.net lose it, and every other URL is returned as is.
+ */
+public class TestSlashURLNormalizer extends TestCase {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  public void testSlashURLNormalizer() throws Exception {
+    Configuration configuration = NutchConfiguration.create();
+
+    String rulesPath = SAMPLES + SEPARATOR + "slashes.txt";
+    SlashURLNormalizer normalizer = new SlashURLNormalizer(rulesPath);
+    normalizer.setConf(configuration);
+
+    // Root URLs with a trailing slash stay as they are.
+    assertUnchanged(normalizer, "http://example.org/");
+    assertUnchanged(normalizer, "http://example.net/");
+
+    // Base URLs are not touched, with or without the trailing slash.
+    assertUnchanged(normalizer, "http://example.org");
+    assertUnchanged(normalizer, "http://example.net");
+    assertUnchanged(normalizer, "http://example.org/");
+    assertUnchanged(normalizer, "http://example.net/");
+
+    // Slash appended on www.example.org, removed on www.example.net.
+    assertNormalized(normalizer, "http://www.example.org/page/",
+        "http://www.example.org/page");
+    assertNormalized(normalizer, "http://www.example.net/path/to/something",
+        "http://www.example.net/path/to/something/");
+
+    // Same hosts without the "www." prefix: no change.
+    assertUnchanged(normalizer, "http://example.org/buh/");
+    assertUnchanged(normalizer, "http://example.net/blaat");
+
+    // Other hosts: no change.
+    assertUnchanged(normalizer, "http://example.nl/buh/");
+    assertUnchanged(normalizer, "http://example.de/blaat");
+
+    // The slash is added or removed in front of the query string.
+    assertNormalized(normalizer, "http://www.example.org/page/?a=b&c=d",
+        "http://www.example.org/page?a=b&c=d");
+    assertNormalized(normalizer,
+        "http://www.example.net/path/to/something?a=b&c=d",
+        "http://www.example.net/path/to/something/?a=b&c=d");
+
+    // Paths ending in a file extension: no change.
+    assertUnchanged(normalizer, "http://www.example.org/noise.mp3");
+    assertUnchanged(normalizer, "http://www.example.org/page.html");
+    assertUnchanged(normalizer, "http://www.example.org/page.shtml");
+
+    // Dots in the last segment that do not form an extension: slash appended.
+    assertNormalized(normalizer,
+        "http://www.example.org/this.is.not.an_extension/",
+        "http://www.example.org/this.is.not.an_extension");
+  }
+
+  /** Asserts that <code>url</code> normalizes to <code>expected</code> in the default scope. */
+  private static void assertNormalized(SlashURLNormalizer normalizer,
+      String expected, String url) throws Exception {
+    assertEquals(expected,
+        normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT));
+  }
+
+  /** Asserts that <code>url</code> is returned as is in the default scope. */
+  private static void assertUnchanged(SlashURLNormalizer normalizer,
+      String url) throws Exception {
+    assertNormalized(normalizer, url, url);
+  }
+}