You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:08 UTC

[24/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
new file mode 100644
index 0000000..d815c45
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.ajax;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/** Unit tests for AjaxURLNormalizer. */
+public class TestAjaxURLNormalizer extends TestCase {
+  private AjaxURLNormalizer normalizer;
+  private Configuration conf;
+  
+  public TestAjaxURLNormalizer(String name) {
+    super(name);
+    normalizer = new AjaxURLNormalizer();
+    conf = NutchConfiguration.create();
+    normalizer.setConf(conf);
+  }
+
+  public void testNormalizer() throws Exception {
+    // check if AJAX URL's are normalized to an _escaped_frament_ form
+    normalizeTest("http://example.org/#!k=v", "http://example.org/?_escaped_fragment_=k=v");
+
+    // Check with some escaped chars
+    normalizeTest("http://example.org/#!k=v&something=is wrong", "http://example.org/?_escaped_fragment_=k=v%26something=is%20wrong");
+
+    // Check with query string and multiple fragment params
+    normalizeTest("http://example.org/path.html?queryparam=queryvalue#!key1=value1&key2=value2", "http://example.org/path.html?queryparam=queryvalue&_escaped_fragment_=key1=value1%26key2=value2");
+  }
+  
+  public void testNormalizerWhenIndexing() throws Exception {
+    // check if it works the other way around
+    normalizeTest("http://example.org/?_escaped_fragment_=key=value", "http://example.org/#!key=value", URLNormalizers.SCOPE_INDEXER);
+    normalizeTest("http://example.org/?key=value&_escaped_fragment_=key=value", "http://example.org/?key=value#!key=value", URLNormalizers.SCOPE_INDEXER);
+    normalizeTest("http://example.org/page.html?key=value&_escaped_fragment_=key=value%26something=is%20wrong", "http://example.org/page.html?key=value#!key=value&something=is wrong", URLNormalizers.SCOPE_INDEXER);
+  }
+
+  private void normalizeTest(String weird, String normal) throws Exception {
+    assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
+  }
+  
+  private void normalizeTest(String weird, String normal, String scope) throws Exception {
+    assertEquals(normal, normalizer.normalize(weird, scope));
+  }
+
+  public static void main(String[] args) throws Exception {
+    new TestAjaxURLNormalizer("test").testNormalizer();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/build.xml b/nutch-plugins/urlnormalizer-basic/build.xml
new file mode 100644
index 0000000..5a74bb0
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-basic/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-basic" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/ivy.xml b/nutch-plugins/urlnormalizer-basic/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-basic/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/plugin.xml b/nutch-plugins/urlnormalizer-basic/plugin.xml
new file mode 100644
index 0000000..fb505aa
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-basic/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-basic"
+   name="Basic URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-basic.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.basic"
+              name="Nutch Basic URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="BasicURLNormalizer"
+                      class="org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/pom.xml b/nutch-plugins/urlnormalizer-basic/pom.xml
new file mode 100644
index 0000000..d87f112
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-basic/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlnormalizer-basic</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlnormalizer-basic</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
new file mode 100644
index 0000000..3e00346
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -0,0 +1,290 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.basic;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Converts URLs to a normal form:
+ * <ul>
+ * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
+ * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
+ * <li>normalize <a href=
+ * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI">
+ * percent-encoding</a> in URL paths</li>
+ * </ul>
+ */
+public class BasicURLNormalizer extends Configured implements URLNormalizer {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(BasicURLNormalizer.class);
+
+  /**
+   * Pattern to detect whether a URL path could be normalized. Contains one of
+   * /. or ./ /.. or ../ //
+   */
+  private final static Pattern hasNormalizablePathPattern = Pattern
+      .compile("/[./]|[.]/");
+
+  /**
+   * Nutch 1098 - finds URL encoded parts of the URL
+   */
+  private final static Pattern unescapeRulePattern = Pattern
+      .compile("%([0-9A-Fa-f]{2})");
+  
+  // charset used for encoding URLs before escaping
+  private final static Charset utf8 = Charset.forName("UTF-8");
+
+  /** look-up table for characters which should not be escaped in URL paths */
+  private final static boolean[] unescapedCharacters = new boolean[128];
+  static {
+    /*
+     * RFC 3986, section 2.3 (https://tools.ietf.org/html/rfc3986#section-2.3):
+     * percent-encoded octets of the unreserved characters - ALPHA
+     * (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
+     * underscore (%5F) and tilde (%7E) - should be decoded by URI
+     * normalizers. Only these characters are flagged in the table, every
+     * other entry keeps the array default (false).
+     */
+    for (int c = 0; c < unescapedCharacters.length; c++) {
+      boolean alpha = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
+      boolean digit = '0' <= c && c <= '9';
+      boolean mark = c == '-' || c == '.' || c == '_' || c == '~';
+      unescapedCharacters[c] = alpha || digit || mark;
+    }
+  }
+
+  /**
+   * Normalizes a URL. The input is trimmed and parsed; for <code>http</code>,
+   * <code>https</code> and <code>ftp</code> URLs the host is lower-cased, a
+   * default port and the fragment ("ref") are dropped, an empty file becomes
+   * <code>/</code> and the path is normalized. For every protocol the
+   * percent-encoding of the file part is normalized.
+   *
+   * @param urlString
+   *          URL to normalize; the empty string is returned as is
+   * @param scope
+   *          normalization scope, not evaluated by this normalizer
+   * @return the rebuilt URL if anything changed, otherwise the trimmed input
+   * @throws MalformedURLException
+   *           if the trimmed input cannot be parsed as URL
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+
+    if ("".equals(urlString)) // permit empty
+      return urlString;
+
+    urlString = urlString.trim(); // remove extra spaces
+
+    URL url = new URL(urlString);
+
+    String protocol = url.getProtocol();
+    String host = url.getHost();
+    int port = url.getPort();
+    String file = url.getFile();
+
+    // set as soon as any component differs from the input, so that the URL
+    // is only rebuilt when necessary
+    boolean changed = false;
+
+    if (!urlString.startsWith(protocol)) // protocol was lowercased
+      changed = true;
+
+    if ("http".equals(protocol) || "https".equals(protocol)
+        || "ftp".equals(protocol)) {
+
+      if (host != null) {
+        // Locale.ROOT: lower-casing must not depend on the default locale
+        // (e.g. a Turkish locale maps "I" to a dotless "i")
+        String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
+        if (!host.equals(newHost)) {
+          host = newHost;
+          changed = true;
+        }
+      }
+
+      if (port == url.getDefaultPort()) { // uses default port
+        port = -1; // so don't specify it
+        changed = true;
+      }
+
+      if (file == null || "".equals(file)) { // add a slash
+        file = "/";
+        changed = true;
+      }
+
+      if (url.getRef() != null) { // remove the ref
+        // the ref is not passed to the URL constructor below, rebuilding
+        // the URL is sufficient to drop it
+        changed = true;
+      }
+
+      // check for unnecessary use of "/../", "/./", and "//"
+      String file2 = getFileWithNormalizedPath(url);
+      if (!file.equals(file2)) {
+        changed = true;
+        file = file2;
+      }
+    }
+
+    // properly encode characters in path/file using percent-encoding
+    String file2 = unescapePath(file);
+    file2 = escapePath(file2);
+    if (!file.equals(file2)) {
+      changed = true;
+      file = file2;
+    }
+
+    if (changed)
+      urlString = new URL(protocol, host, port, file).toString();
+
+    return urlString;
+  }
+
+  /**
+   * Returns the file part (path and query) of the URL with a normalized
+   * path. Never returns an empty string: an empty file is replaced by a
+   * single slash.
+   */
+  private String getFileWithNormalizedPath(URL url)
+      throws MalformedURLException {
+    // default and fall-back: the file exactly as given in the URL
+    String normalized = url.getFile();
+
+    // skip the conversion to URI unless the path contains something
+    // the pattern considers normalizable
+    if (hasNormalizablePathPattern.matcher(url.getPath()).find()) {
+      try {
+        normalized = url.toURI().normalize().toURL().getFile();
+        // URI.normalize() keeps leading "/.." segments, strip them here,
+        // see also http://tools.ietf.org/html/rfc3986#section-5.2.4
+        int skip = 0;
+        while (normalized.startsWith("/../", skip)) {
+          skip += 3;
+        }
+        if (skip > 0) {
+          normalized = normalized.substring(skip);
+        }
+      } catch (URISyntaxException e) {
+        // not convertible into a URI: keep the file unchanged
+        normalized = url.getFile();
+      }
+    }
+
+    return normalized.isEmpty() ? "/" : normalized;
+  }
+  
+  /**
+   * Remove % encoding from path segment in URL for characters which should be
+   * unescaped according to <a
+   * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
+   */
+  private String unescapePath(String path) {
+    StringBuilder sb = new StringBuilder(path.length());
+
+    Matcher matcher = unescapeRulePattern.matcher(path);
+
+    // index of the first character of path not yet copied to the output
+    int copied = 0;
+
+    // Traverse over all percent-encoded groups
+    while (matcher.find()) {
+      // Copy the literal text between the previous group and this one
+      sb.append(path, copied, matcher.start());
+
+      // group(1) holds the two hex digits following the '%'
+      int code = Integer.parseInt(matcher.group(1), 16);
+
+      if (code < 128 && unescapedCharacters[code]) {
+        // character should be unescaped in URLs
+        sb.append((char) code);
+      } else {
+        // keep the escape sequence but with upper-case hex digits
+        sb.append(matcher.group().toUpperCase(Locale.ROOT));
+      }
+
+      copied = matcher.end();
+    }
+
+    // Copy what follows the last group (the entire path if nothing matched)
+    sb.append(path, copied, path.length());
+
+    return sb.toString();
+  }
+
+  /**
+   * Convert path segment of URL from Unicode to UTF-8 and escape all
+   * characters which should be escaped according to <a
+   * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
+   */
+  private String escapePath(String path) {
+    StringBuilder sb = new StringBuilder(path.length());
+
+    // Traverse over all bytes of the UTF-8 encoded path
+    for (byte b : path.getBytes(utf8)) {
+      // Java bytes are signed: "b < 33" matches control characters, the
+      // space and also every byte >= 0x80 (negative as signed byte), i.e.
+      // all bytes of non-ASCII characters. 91 and 93 are '[' and ']'.
+      if (b < 33 || b == 91 || b == 93) {
+        // Start escape sequence
+        sb.append('%');
+
+        // Upper-case hexadecimal representation of the unsigned byte value,
+        // Locale.ROOT to be independent from the default locale
+        String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT);
+
+        // Always emit two digits: prepend a zero for values below 0x10
+        if (hex.length() < 2) {
+          sb.append('0');
+        }
+        sb.append(hex);
+      } else {
+        // No escaping needed, append the ASCII character as-is
+        sb.append((char) b);
+      }
+    }
+
+    return sb.toString();
+  }
+
+  /**
+   * Command-line tool: reads URLs line by line from standard input and
+   * prints the normalized form of each, or "failed: " followed by the input
+   * if the URL is malformed. An optional first argument sets the scope.
+   */
+  public static void main(String args[]) throws IOException {
+    BasicURLNormalizer urlNormalizer = new BasicURLNormalizer();
+    urlNormalizer.setConf(NutchConfiguration.create());
+
+    String scope = URLNormalizers.SCOPE_DEFAULT;
+    if (args.length >= 1) {
+      scope = args[0];
+      System.out.println("Scope: " + scope);
+    }
+
+    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
+    for (String url = stdin.readLine(); url != null; url = stdin.readLine()) {
+      try {
+        System.out.println(urlNormalizer.normalize(url, scope));
+      } catch (MalformedURLException e) {
+        System.out.println("failed: " + url);
+      }
+    }
+    System.exit(0);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java b/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
new file mode 100644
index 0000000..ae59a84
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-basic/src/main/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer performing basic normalizations: remove default ports
+ * and dot segments in path.
+ */
+package org.apache.nutch.net.urlnormalizer.basic;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
new file mode 100644
index 0000000..9a0f8c4
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for BasicURLNormalizer. */
+public class TestBasicURLNormalizer {
+  private BasicURLNormalizer normalizer;
+
+  private Configuration conf;
+
+  /** Sets up the normalizer under test with the default Nutch configuration. */
+  public TestBasicURLNormalizer() {
+    this.normalizer = new BasicURLNormalizer();
+    this.conf = NutchConfiguration.create();
+    this.normalizer.setConf(this.conf);
+  }
+  
+  @Test // NUTCH-1098: normalization of percent-encoding in the URL path
+  public void testNUTCH1098() throws Exception {
+    // check that % encoding is normalized
+    normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+
+    // check that % encoding works correctly at end of URL
+    normalizeTest("http://foo.com/%66oo.htm%6c", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/%66oo.ht%6dl", "http://foo.com/foo.html");
+
+    // check that the % decoder handles directly adjacent escape sequences
+    normalizeTest("http://foo.com/%66oo.ht%6d%6c", "http://foo.com/foo.html");
+    
+    // check that % decoder leaves high bit chars alone
+    normalizeTest("http://foo.com/%66oo.htm%C0", "http://foo.com/foo.htm%C0");
+
+    // check that % decoder leaves control chars alone
+    normalizeTest("http://foo.com/%66oo.htm%1A", "http://foo.com/foo.htm%1A");
+
+    // check that % decoder converts hex digits of kept escapes to upper case
+    normalizeTest("http://foo.com/%66oo.htm%c0", "http://foo.com/foo.htm%C0");
+
+    // check that % decoder leaves encoded spaces alone
+    normalizeTest("http://foo.com/you%20too.html", "http://foo.com/you%20too.html");
+
+    // check that spaces are encoded into %20
+    normalizeTest("http://foo.com/you too.html", "http://foo.com/you%20too.html");
+
+    // check that encoded # are not decoded
+    normalizeTest("http://foo.com/file.html%23cz", "http://foo.com/file.html%23cz");
+
+    // check that encoded / are not decoded (only the hex digits are upper-cased)
+    normalizeTest("http://foo.com/fast/dir%2fcz", "http://foo.com/fast/dir%2Fcz");
+
+    // check that control chars are encoded
+    normalizeTest("http://foo.com/\u001a!", "http://foo.com/%1A!");
+
+    // check that control chars are always encoded into 2 digits
+    normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!");
+
+    // check encoding of Spanish chars (non-ASCII is escaped as UTF-8 bytes)
+    normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx");
+  }
+  
+  @Test // NUTCH-2064: escaped punctuation, query part and host name are kept
+  public void testNUTCH2064() throws Exception {
+    // Ampersand and colon and other punctuation characters are not to be unescaped
+    normalizeTest("http://x.com/s?q=a%26b&m=10", "http://x.com/s?q=a%26b&m=10");
+    normalizeTest("http://x.com/show?http%3A%2F%2Fx.com%2Fb",
+        "http://x.com/show?http%3A%2F%2Fx.com%2Fb");
+    normalizeTest("http://google.com/search?q=c%2B%2B",
+        "http://google.com/search?q=c%2B%2B");
+    // also do not touch the query part which is application/x-www-form-urlencoded
+    normalizeTest("http://x.com/s?q=a+b", "http://x.com/s?q=a+b");
+    // and keep Internationalized domain names
+    // http://bücher.de/ may be http://xn--bcher-kva.de/
+    // but definitely not http://b%C3%BCcher.de/
+    normalizeTest("http://b\u00fccher.de/", "http://b\u00fccher.de/");
+    // test whether percent-encoding works together with other normalizations
+    normalizeTest("http://x.com/./a/../%66.html", "http://x.com/f.html");
+    // [ and ] need escaping as well
+    normalizeTest("http://x.com/?x[y]=1", "http://x.com/?x%5By%5D=1");
+    // boundary test for first character outside the ASCII range (U+0080)
+    normalizeTest("http://x.com/foo\u0080", "http://x.com/foo%C2%80");
+    normalizeTest("http://x.com/foo%c2%80", "http://x.com/foo%C2%80");
+  }
+
+  @Test // basic normalizations: case, default port, empty path, ref, dot segments, "//"
+  public void testNormalizer() throws Exception {
+    // check that leading and trailing spaces are removed
+    normalizeTest(" http://foo.com/ ", "http://foo.com/");
+
+    // check that protocol is lower cased
+    normalizeTest("HTTP://foo.com/", "http://foo.com/");
+
+    // check that host is lower cased
+    normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
+    normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); // NOTE(review): duplicate of the previous assertion
+
+    // check that port number is normalized
+    normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
+    normalizeTest("http://foo.com:81/", "http://foo.com:81/");
+
+    // check that null path is normalized
+    normalizeTest("http://foo.com", "http://foo.com/");
+
+    // check that references are removed
+    normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html");
+
+    // normalization of percent-encoding ("%66oo.html" becomes "foo.html")
+    // is covered by testNUTCH1098()
+
+    // check that unnecessary "../" are removed
+
+    normalizeTest("http://foo.com/aa/./foo.html", "http://foo.com/aa/foo.html");
+    normalizeTest("http://foo.com/aa/../", "http://foo.com/");
+    normalizeTest("http://foo.com/aa/bb/../", "http://foo.com/aa/");
+    normalizeTest("http://foo.com/aa/..", "http://foo.com/");
+    normalizeTest("http://foo.com/aa/bb/cc/../../foo.html",
+        "http://foo.com/aa/foo.html");
+    normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
+        "http://foo.com/aa/cc/ee/foo.html");
+    normalizeTest("http://foo.com/../foo.html", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/../../foo.html", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/aa/../../foo.html", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/aa/../bb/../foo.html/../../",
+        "http://foo.com/");
+    normalizeTest("http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html");
+    normalizeTest("http://foo.com/../aa/../foo.html", "http://foo.com/foo.html"); // NOTE(review): same case as three assertions above
+    normalizeTest("http://foo.com/a..a/foo.html",
+        "http://foo.com/a..a/foo.html");
+    normalizeTest("http://foo.com/a..a/../foo.html", "http://foo.com/foo.html");
+    normalizeTest("http://foo.com/foo.foo/../foo.html",
+        "http://foo.com/foo.html");
+    normalizeTest("http://foo.com//aa/bb/foo.html",
+        "http://foo.com/aa/bb/foo.html");
+    normalizeTest("http://foo.com/aa//bb/foo.html",
+        "http://foo.com/aa/bb/foo.html");
+    normalizeTest("http://foo.com/aa/bb//foo.html",
+        "http://foo.com/aa/bb/foo.html");
+    normalizeTest("http://foo.com//aa//bb//foo.html",
+        "http://foo.com/aa/bb/foo.html");
+    normalizeTest("http://foo.com////aa////bb////foo.html",
+        "http://foo.com/aa/bb/foo.html");
+    normalizeTest("http://foo.com/aa?referer=http://bar.com", // "//" in the query is kept
+        "http://foo.com/aa?referer=http://bar.com");
+  }
+
+  /**
+   * Asserts that <code>weird</code> is normalized to <code>normal</code> in
+   * the default scope. The input URL is part of the failure message.
+   */
+  private void normalizeTest(String weird, String normal) throws Exception {
+    String message = "normalizing: " + weird;
+    String actual = normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT);
+    Assert.assertEquals(message, normal, actual);
+  }
+
+  public static void main(String[] args) throws Exception {
+    new TestBasicURLNormalizer().testNormalizer();
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/build.xml b/nutch-plugins/urlnormalizer-host/build.xml
new file mode 100644
index 0000000..516596d
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-host" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/data/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/data/hosts.txt b/nutch-plugins/urlnormalizer-host/data/hosts.txt
new file mode 100644
index 0000000..c7e0ccf
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/data/hosts.txt
@@ -0,0 +1,8 @@
+# Map every sub domain of example.com to the bare domain
+*.example.com example.com
+
+# Strip the www. sub domain from www.example.net URLs
+www.example.net example.net
+
+# Force www. sub domain when hitting link without sub domain
+example.org www.example.org
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/ivy.xml b/nutch-plugins/urlnormalizer-host/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/plugin.xml b/nutch-plugins/urlnormalizer-host/plugin.xml
new file mode 100644
index 0000000..f2b9615
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-host"
+   name="Host URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-host.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.host"
+              name="Nutch Host URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="HostURLNormalizer"
+                      class="org.apache.nutch.net.urlnormalizer.host.HostURLNormalizer">
+        <parameter name="file" value="host-urlnormalizer.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/pom.xml b/nutch-plugins/urlnormalizer-host/pom.xml
new file mode 100644
index 0000000..217029e
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlnormalizer-host</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlnormalizer-host</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
new file mode 100644
index 0000000..8d5c110
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.host;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashMap;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * URL normalizer for mapping hosts to their desired form. It takes a simple
+ * text file as source in the format:
+ * 
+ * example.org www.example.org
+ * 
+ * mapping all URLs of example.org to the www sub-domain. It also allows for
+ * wildcards to be used to map all sub-domains to another host:
+ * 
+ * *.example.org www.example.org
+ * 
+ * Host and target are separated by whitespace. Blank lines and lines starting
+ * with # are ignored. The rules are kept in a map shared by all instances of
+ * this class and are loaded only once: while the map is non-empty, later
+ * calls to {@link #setConf(Configuration)} do not read any rules again.
+ */
+public class HostURLNormalizer implements URLNormalizer {
+
+  private Configuration conf;
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(HostURLNormalizer.class);
+
+  /** Value of the "file" attribute found in plugin.xml, if any. */
+  private static String attributeFile = null;
+
+  /** Rules file passed to the constructor; wins over all other file settings. */
+  private String hostsFile = null;
+
+  /** Host (or "*.suffix" wildcard) to target host, shared by all instances. */
+  private static final HashMap<String, String> hostsMap = new HashMap<String, String>();
+
+  /** Creates a normalizer whose rules are located via the configuration. */
+  public HostURLNormalizer() {
+  }
+
+  /**
+   * Creates a normalizer that reads its rules from the given file.
+   * 
+   * @param hostsFile
+   *          name of the rules file; takes precedence over the plugin.xml
+   *          attribute and the urlnormalizer.hosts.file property
+   */
+  public HostURLNormalizer(String hostsFile) {
+    this.hostsFile = hostsFile;
+  }
+
+  /**
+   * Parses host rules from the reader into the shared rule map. Does nothing
+   * if the map already holds rules.
+   * 
+   * @param configReader
+   *          source of the rules, one "host target" pair per line
+   * @throws IOException
+   *           if reading from the reader fails
+   */
+  private void readConfiguration(Reader configReader) throws IOException {
+    // The map is static, so lock on the map itself: locking on the instance
+    // would not keep two instances from loading concurrently.
+    synchronized (hostsMap) {
+      if (!hostsMap.isEmpty()) {
+        return;
+      }
+
+      BufferedReader reader = new BufferedReader(configReader);
+      String line;
+
+      while ((line = reader.readLine()) != null) {
+        // Trim first so that indented comments and padded rules are handled
+        line = line.trim();
+        if (StringUtils.isBlank(line) || line.startsWith("#")) {
+          continue;
+        }
+
+        // Split at the first run of whitespace (spaces or tabs)
+        String[] rule = line.split("\\s+", 2);
+        if (rule.length < 2) {
+          LOG.warn("Ignoring malformed host rule (expected \"host target\"): "
+              + line);
+          continue;
+        }
+        hostsMap.put(rule[0], rule[1]);
+      }
+    }
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration and loads the host rules. Rules given inline in
+   * the urlnormalizer.hosts.rules property take precedence over any file. The
+   * file name is taken from the constructor argument, else from the "file"
+   * attribute in plugin.xml, else from the urlnormalizer.hosts.file property.
+   * I/O errors while loading are logged, not thrown.
+   * 
+   * @param conf
+   *          the configuration to use
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // look up the "file" attribute declared for this plugin in plugin.xml
+    String pluginName = "urlnormalizer-host";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLNormalizer.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    } else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+            + pluginName);
+      }
+    }
+
+    // constructor-supplied file and attribute "file" take precedence over
+    // the configuration property if defined
+    String file = conf.get("urlnormalizer.hosts.file");
+    String stringRules = conf.get("urlnormalizer.hosts.rules");
+    if (hostsFile != null) {
+      file = hostsFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    // Without inline rules and without a file name there is nothing to open
+    if (stringRules == null && file == null) {
+      LOG.error("No host rules configured for plugin " + pluginName
+          + ": neither a rules file nor urlnormalizer.hosts.rules is set");
+      return;
+    }
+
+    Reader reader = null;
+    try {
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      } else {
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          // not found as a configuration resource, try it as a plain path
+          reader = new FileReader(file);
+        }
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    } finally {
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException e) {
+          LOG.warn("Failed to close host rules reader: " + e);
+        }
+      }
+    }
+  }
+
+  /**
+   * Replaces the host of the URL if a rule matches it. An exact host rule is
+   * tried first. Otherwise wildcard rules are tried for every proper suffix
+   * of the host that has at least two labels, shortest suffix first, so for
+   * a.b.example.org the keys *.example.org and *.b.example.org are looked up
+   * in that order.
+   * 
+   * @param urlString
+   *          the URL to normalize
+   * @param scope
+   *          the normalization scope; not used by this normalizer
+   * @return the URL with its host replaced, or the unchanged URL if no rule
+   *         matches
+   * @throws MalformedURLException
+   *           if urlString cannot be parsed as a URL
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    String host = new URL(urlString).getHost();
+
+    // Test static hosts
+    String exactTarget = hostsMap.get(host);
+    if (exactTarget != null) {
+      return replaceHost(urlString, host, exactTarget);
+    }
+
+    // Test for wildcard in reverse order
+    String[] hostParts = host.split("\\.");
+
+    // A host made only of dots splits into an empty array
+    if (hostParts.length == 0) {
+      return urlString;
+    }
+
+    // Buffer for the host suffix, starting with the tld
+    StringBuilder hostBuffer = new StringBuilder(
+        hostParts[hostParts.length - 1]);
+
+    // Stop before index 0: the wildcard has to stand for at least one label
+    for (int i = hostParts.length - 2; i > 0; i--) {
+      // Prepend another sub domain
+      hostBuffer.insert(0, hostParts[i] + ".");
+
+      // Check if a rule exists for this wildcarded sub domain
+      String wildCardTarget = hostsMap.get("*." + hostBuffer.toString());
+      if (wildCardTarget != null) {
+        // Replace the original input host with the wildcard target
+        return replaceHost(urlString, host, wildCardTarget);
+      }
+    }
+
+    return urlString;
+  }
+
+  /**
+   * Replaces the first occurrence of host in the host position of the URL
+   * with target. The scheme and any user-info are skipped when searching, so
+   * a host such as "http" is not mistaken for the scheme of "http://...".
+   * 
+   * @param urlString
+   *          the URL to rewrite
+   * @param host
+   *          the host as it appears in urlString
+   * @param target
+   *          the host to put in its place
+   * @return the rewritten URL, or urlString unchanged if host is not found
+   */
+  protected String replaceHost(String urlString, String host, String target) {
+    // The authority starts right after "://" when a scheme is present
+    int schemeEnd = urlString.indexOf("://");
+    int searchFrom = schemeEnd < 0 ? 0 : schemeEnd + 3;
+
+    // The authority ends at the first path, query or fragment delimiter
+    int authorityEnd = urlString.length();
+    for (int i = searchFrom; i < urlString.length(); i++) {
+      char c = urlString.charAt(i);
+      if (c == '/' || c == '?' || c == '#') {
+        authorityEnd = i;
+        break;
+      }
+    }
+
+    // Skip "user:password@" so the host is not matched inside the user-info
+    int userInfoEnd = urlString.lastIndexOf('@', authorityEnd - 1);
+    if (userInfoEnd >= searchFrom) {
+      searchFrom = userInfoEnd + 1;
+    }
+
+    int hostIndex = urlString.indexOf(host, searchFrom);
+    if (hostIndex < 0) {
+      return urlString;
+    }
+
+    StringBuilder buffer = new StringBuilder();
+
+    buffer.append(urlString.substring(0, hostIndex));
+    buffer.append(target);
+    buffer.append(urlString.substring(hostIndex + host.length()));
+
+    return buffer.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/package-info.java b/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
new file mode 100644
index 0000000..62c97d7
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/src/main/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer renaming hosts to a canonical form listed in the
+ * configuration file.
+ */
+package org.apache.nutch.net.urlnormalizer.host;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java b/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
new file mode 100644
index 0000000..c9e1a2c
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.host;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit test for {@link HostURLNormalizer} driven by a hosts.txt rules file. */
+public class TestHostURLNormalizer {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  /**
+   * Loads hosts.txt from the test data directory and checks one exact-host
+   * rule in each direction plus the wildcard rule.
+   */
+  @Test
+  public void testHostURLNormalizer() throws Exception {
+    String hostsFile = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration conf = NutchConfiguration.create();
+
+    HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile);
+    normalizer.setConf(conf);
+
+    // A bare example.org host gains the www. sub domain
+    assertNormalized(normalizer, "http://www.example.org/page.html",
+        "http://example.org/page.html");
+
+    // The www. sub domain is stripped from www.example.net
+    assertNormalized(normalizer, "http://example.net/path/to/something.html",
+        "http://www.example.net/path/to/something.html");
+
+    // The bare example.com host stays as it is ...
+    assertNormalized(normalizer, "http://example.com/?does=it&still=work",
+        "http://example.com/?does=it&still=work");
+    // ... while any of its sub domains collapse to example.com
+    assertNormalized(normalizer, "http://example.com/buh",
+        "http://http.www.example.com/buh");
+    assertNormalized(normalizer, "http://example.com/blaat",
+        "http://whatever.example.com/blaat");
+  }
+
+  /**
+   * Asserts that normalizing {@code input} under the default scope yields
+   * {@code expected}.
+   */
+  private static void assertNormalized(HostURLNormalizer normalizer,
+      String expected, String input) throws Exception {
+    Assert.assertEquals(expected,
+        normalizer.normalize(input, URLNormalizers.SCOPE_DEFAULT));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/build.xml b/nutch-plugins/urlnormalizer-pass/build.xml
new file mode 100644
index 0000000..b478e45
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-pass/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-pass" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/ivy.xml b/nutch-plugins/urlnormalizer-pass/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-pass/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/plugin.xml b/nutch-plugins/urlnormalizer-pass/plugin.xml
new file mode 100644
index 0000000..31dcc70
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-pass/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-pass"
+   name="Pass-through URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-pass.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.pass"
+              name="Nutch Pass-through URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="PassURLNormalizer"
+                      class="org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/pom.xml b/nutch-plugins/urlnormalizer-pass/pom.xml
new file mode 100644
index 0000000..502d0d4
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-pass/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlnormalizer-pass</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlnormalizer-pass</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java b/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
new file mode 100644
index 0000000..03d510c
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.pass;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizer;
+
+/**
+ * A URLNormalizer that returns every URL exactly as it was given. It is
+ * sometimes useful if for a given scope at least one normalizer must be
+ * defined but no transformations are required.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class PassURLNormalizer implements URLNormalizer {
+
+  /** Configuration handed in through {@link #setConf(Configuration)}. */
+  private Configuration conf;
+
+  /** Remembers the configuration; nothing is read from it here. */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /** Returns the configuration last passed to {@link #setConf(Configuration)}. */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Returns the URL unchanged.
+   * 
+   * @param urlString
+   *          the URL to pass through
+   * @param scope
+   *          the normalization scope; ignored
+   * @return {@code urlString} itself
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    return urlString;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java b/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java
new file mode 100644
index 0000000..eab6c2e
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-pass/src/main/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer dummy which does not change URLs. Required because at least
+ * one URL normalizer must be defined in any scope.
+ */
+package org.apache.nutch.net.urlnormalizer.pass;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java b/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
new file mode 100644
index 0000000..f470c62
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.pass;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Checks that {@link PassURLNormalizer} hands a URL back unmodified. */
+public class TestPassURLNormalizer {
+
+  @Test
+  public void testPassURLNormalizer() {
+    final String input = "http://www.example.com/test/..//";
+    Configuration configuration = NutchConfiguration.create();
+    PassURLNormalizer passNormalizer = new PassURLNormalizer();
+    passNormalizer.setConf(configuration);
+
+    String normalized = null;
+    try {
+      normalized = passNormalizer.normalize(input, URLNormalizers.SCOPE_DEFAULT);
+    } catch (MalformedURLException mue) {
+      Assert.fail(mue.toString());
+    }
+    Assert.assertEquals(input, normalized);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/build.xml b/nutch-plugins/urlnormalizer-protocol/build.xml
new file mode 100644
index 0000000..71df8e2
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-protocol" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- copy the plugin's sample data into the test build directory for the JUnit tests -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/data/protocols.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/data/protocols.txt b/nutch-plugins/urlnormalizer-protocol/data/protocols.txt
new file mode 100644
index 0000000..7091cd7
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/data/protocols.txt
@@ -0,0 +1,7 @@
+# format: host\tprotocol\n
+
+example.org	http
+example.net	http
+
+example.io	https
+example.nl	https

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/ivy.xml b/nutch-plugins/urlnormalizer-protocol/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/plugin.xml b/nutch-plugins/urlnormalizer-protocol/plugin.xml
new file mode 100644
index 0000000..639b16a
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-protocol"
+   name="Protocol URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-protocol.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.protocol"
+              name="Nutch Protocol URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="ProtocolURLNormalizer"
+                      class="org.apache.nutch.net.urlnormalizer.protocol.ProtocolURLNormalizer">
+        <parameter name="file" value="protocols.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/pom.xml b/nutch-plugins/urlnormalizer-protocol/pom.xml
new file mode 100644
index 0000000..7c92a2c
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlnormalizer-protocol</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlnormalizer-protocol</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/src/main/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/src/main/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/nutch-plugins/urlnormalizer-protocol/src/main/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
new file mode 100644
index 0000000..4278325
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/src/main/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.protocol;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Rewrites the protocol (scheme) of a URL to the one configured for its host.
+ * @author markus@openindex.io
+ */
+public class ProtocolURLNormalizer implements URLNormalizer {
+
+  private Configuration conf;
+
+  private static final Logger LOG = LoggerFactory.getLogger(ProtocolURLNormalizer.class);
+  private static final char QUESTION_MARK = '?';
+  private static final String PROTOCOL_DELIMITER = "://";
+
+  private static String attributeFile = null;
+  private String protocolsFile = null;
+
+  // Maps a host name to the protocol that URLs of this host must use. The map
+  // is static: rules are loaded once and shared by all instances.
+  private static final Map<String,String> protocolsMap = new HashMap<String,String>();
+
+  public ProtocolURLNormalizer() {}
+
+  public ProtocolURLNormalizer(String protocolsFile) {
+    this.protocolsFile = protocolsFile;
+  }
+
+  /** Loads "host protocol" rules once; blank lines and '#' comments are skipped. */
+  private static synchronized void readConfiguration(Reader configReader) throws IOException {
+    if (protocolsMap.size() > 0) {
+      return;
+    }
+    BufferedReader reader = new BufferedReader(configReader);
+    String line;
+    while ((line = reader.readLine()) != null) {
+      line = line.trim(); // trim() returns a new string, so assign the result
+      if (StringUtils.isBlank(line) || line.startsWith("#")) {
+        continue;
+      }
+      int delimiterIndex = line.indexOf(" ");
+      if (delimiterIndex == -1) {
+        delimiterIndex = line.indexOf("\t"); // try tabulator
+      }
+      if (delimiterIndex == -1) {
+        // substring(0, -1) would throw, so skip a rule without delimiter
+        LOG.warn("Ignoring malformed protocol rule: " + line);
+        continue;
+      }
+      String host = line.substring(0, delimiterIndex);
+      String protocol = line.substring(delimiterIndex + 1).trim();
+      protocolsMap.put(host, protocol);
+    }
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration and loads the rules. "urlnormalizer.protocols.rules"
+   * wins over files; the constructor file wins over the plugin.xml "file"
+   * attribute, which wins over "urlnormalizer.protocols.file".
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // look up the "file" attribute of this plugin's URL normalizer extension
+    String pluginName = "urlnormalizer-protocol";
+    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
+      URLNormalizer.class.getName()).getExtensions();
+    for (Extension extension : extensions) {
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+    if (attributeFile != null) {
+      LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+        + " as " + attributeFile);
+    } else {
+      LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+        + pluginName);
+    }
+
+    // constructor file and attribute "file" take precedence if defined
+    String file = conf.get("urlnormalizer.protocols.file");
+    String stringRules = conf.get("urlnormalizer.protocols.rules");
+    if (protocolsFile != null) {
+      file = protocolsFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+    if (stringRules == null && file == null) {
+      // nothing to load; new FileReader(null) would throw a NullPointerException
+      LOG.error("No rules and no rules file configured for plugin " + pluginName);
+      return;
+    }
+
+    Reader reader = null;
+    try {
+      if (stringRules != null) { // takes precedence over files
+        reader = new StringReader(stringRules);
+      } else {
+        reader = conf.getConfResourceAsReader(file);
+        if (reader == null) {
+          reader = new FileReader(file);
+        }
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    } finally {
+      if (reader != null) {
+        try {
+          reader.close(); // release the underlying file or resource stream
+        } catch (IOException e) {
+          LOG.warn("Failed to close protocol rules reader", e);
+        }
+      }
+    }
+  }
+
+  public String normalize(String url, String scope) throws MalformedURLException {
+    return normalize(url, null, scope);
+  }
+
+  /**
+   * Returns the URL with the protocol configured for its host. The URL is
+   * returned unchanged if no rule exists for the host or a port is set.
+   */
+  public String normalize(String url, CrawlDatum crawlDatum, String scope) throws MalformedURLException {
+    URL u = new URL(url);
+    String host = u.getHost();
+    // an explicit port belongs to the original protocol: leave such URLs alone
+    String requiredProtocol = (u.getPort() == -1) ? protocolsMap.get(host) : null;
+    if (requiredProtocol == null || requiredProtocol.equals(u.getProtocol())) {
+      return url;
+    }
+
+    // Rebuild URL with new protocol; user info and fragment are not kept
+    StringBuilder buffer = new StringBuilder(requiredProtocol);
+    buffer.append(PROTOCOL_DELIMITER).append(host).append(u.getPath());
+    String queryString = u.getQuery();
+    if (queryString != null) {
+      buffer.append(QUESTION_MARK).append(queryString);
+    }
+    return buffer.toString();
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
new file mode 100644
index 0000000..8880628
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.urlnormalizer.protocol;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/** Tests {@link ProtocolURLNormalizer} with rules loaded from the test data protocols.txt. */
+public class TestProtocolURLNormalizer extends TestCase {
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  public void testProtocolURLNormalizer() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+
+    String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt";
+    ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile);
+    normalizer.setConf(conf);
+
+    // no change: the URL already uses the expected http protocol
+    assertEquals("http://example.org/", normalizer.normalize("http://example.org/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.net/", normalizer.normalize("http://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
+    // https to http
+    assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
+    // no change: the URL already uses the expected https protocol
+    assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+
+    // http to https
+    assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-querystring/build.xml b/nutch-plugins/urlnormalizer-querystring/build.xml
new file mode 100644
index 0000000..2d692c4
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-querystring/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-querystring" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-querystring/ivy.xml b/nutch-plugins/urlnormalizer-querystring/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-querystring/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-querystring/plugin.xml b/nutch-plugins/urlnormalizer-querystring/plugin.xml
new file mode 100644
index 0000000..2a677fc
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-querystring/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-querystring"
+   name="Querystrings URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-querystring.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.querystring"
+              name="Nutch Querystring URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="QuerystringURLNormalizer"
+                      class="org.apache.nutch.net.urlnormalizer.querystring.QuerystringURLNormalizer">
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-querystring/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-querystring/pom.xml b/nutch-plugins/urlnormalizer-querystring/pom.xml
new file mode 100644
index 0000000..514f2f0
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-querystring/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlnormalizer-querystring</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlnormalizer-querystring</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>