You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/03/27 18:18:56 UTC

svn commit: r1582410 - in /nutch/branches/2.x: ./ conf/ src/plugin/ src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/ src/plugin/urlfilter-validator/src/test/ src/plugin/urlfilter-validator/src/test/org/ src/plugin/urlfilter...

Author: lewismc
Date: Thu Mar 27 17:18:55 2014
New Revision: 1582410

URL: http://svn.apache.org/r1582410
Log:
NUTCH-1727 Configurable length for Tlds

Added:
    nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/
    nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/
    nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/
    nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/
    nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/
    nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/
    nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/build.xml
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/src/plugin/build.xml
    nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Mar 27 17:18:55 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1727 Configurable length for Tlds (Sertac TURKEL via lewismc)
+
 * NUTCH-1738 Expose number of URLs generated per batch in GeneratorJob (Talat UYARER via lewismc)
 
 * NUTCH-1671 indexchecker to add digest field (snagel, lufeng)

Modified: nutch/branches/2.x/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Thu Mar 27 17:18:55 2014
@@ -989,6 +989,7 @@
         <source path="${basedir}/src/plugin/urlfilter-suffix/src/java/" />
         <source path="${basedir}/src/plugin/urlfilter-suffix/src/test/" />
         <source path="${basedir}/src/plugin/urlfilter-validator/src/java/" />
+      	 <source path="${basedir}/src/plugin/urlfilter-validator/src/test/" />
         <source path="${basedir}/src/plugin/urlnormalizer-basic/src/java/" />
         <source path="${basedir}/src/plugin/urlnormalizer-basic/src/test/" />
         <source path="${basedir}/src/plugin/urlnormalizer-pass/src/java/" />

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Thu Mar 27 17:18:55 2014
@@ -973,6 +973,12 @@
 <!-- urlfilter plugin properties -->
 
 <property>
+  <name>urlfilter.tld.length</name>
+  <value></value>
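+  <description>Maximum character length of a top-level domain (TLD) accepted by the URL validator filter. Values of 2 or less are replaced by the default of 8.</description>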
+</property>
+
+<property>
   <name>urlfilter.domain.file</name>
   <value>domain-urlfilter.txt</value>
   <description>Name of file on CLASSPATH containing either top level domains or

Modified: nutch/branches/2.x/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Thu Mar 27 17:18:55 2014
@@ -91,6 +91,7 @@
      <ant dir="urlfilter-prefix" target="test"/>
      <ant dir="urlfilter-regex" target="test"/>
      <ant dir="urlfilter-suffix" target="test"/>
+     <ant dir="urlfilter-validator" target="test"/>
      <ant dir="urlnormalizer-basic" target="test"/>
      <ant dir="urlnormalizer-pass" target="test"/>
      <ant dir="urlnormalizer-regex" target="test"/>

Modified: nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java (original)
+++ nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java Thu Mar 27 17:18:55 2014
@@ -23,12 +23,16 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.nutch.net.URLFilter;
 
 /**
- * <p>Validates URLs.</p>
- *
- * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b,
- * Date: 03/07/02,
- * http://javascript.internet.com. However, this validation now bears little
- * resemblance to the php original.</p>
+ * <p>
+ * Validates URLs.
+ * </p>
+ * 
+ * <p>
+ * Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date:
+ * 03/07/02, http://javascript.internet.com. However, this validation now bears
+ * little resemblance to the PHP original.
+ * </p>
+ * 
  * <pre>
  *   Example of usage:
  *    UrlValidator urlValidator = UrlValidator.get();
@@ -37,17 +41,17 @@ import org.apache.nutch.net.URLFilter;
  *    } else {
  *       System.out.println("url is invalid");
  *    }
- *
+ * 
  *   prints out "url is valid"
- *  </pre>
- *
- * <p>Based on UrlValidator code from Apache commons-validator.</p>
- *
- * @see
- * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
- *  Uniform Resource Identifiers (URI): Generic Syntax
- * </a>
- *
+ * </pre>
+ * 
+ * <p>
+ * Based on UrlValidator code from Apache commons-validator.
+ * </p>
+ * 
+ * @see <a href='http://www.ietf.org/rfc/rfc2396.txt' > Uniform Resource
+ *      Identifiers (URI): Generic Syntax </a>
+ * 
  */
 public class UrlValidator implements URLFilter {
 
@@ -61,7 +65,7 @@ public class UrlValidator implements URL
 
   private static final String SCHEME_CHARS = ALPHA_CHARS;
 
-  // Drop numeric, and  "+-." for now
+  // Drop numeric, and "+-." for now
   private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
 
   private static final String ATOM = VALID_CHARS + '+';
@@ -69,9 +73,9 @@ public class UrlValidator implements URL
   /**
    * This expression derived/taken from the BNF for URI (RFC2396).
    */
-  private static final Pattern URL_PATTERN =
-    Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)" +
-                    "(\\?([^#]*))?(#(.*))?");
+  private static final Pattern URL_PATTERN = Pattern
+      .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)"
+          + "(\\?([^#]*))?(#(.*))?");
 
   /**
    * Schema/Protocol (ie. http:, ftp:, file:, etc).
@@ -90,11 +94,11 @@ public class UrlValidator implements URL
   /**
    * Protocol (ie. http:, ftp:,https:).
    */
-  private static final Pattern SCHEME_PATTERN =
-    Pattern.compile("^[" + SCHEME_CHARS + "]+");
+  private static final Pattern SCHEME_PATTERN = Pattern.compile("^["
+      + SCHEME_CHARS + "]+");
 
-  private static final Pattern AUTHORITY_PATTERN =
-    Pattern.compile("^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
+  private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^(["
+      + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
 
   private static final int PARSE_AUTHORITY_HOST_IP = 1;
 
@@ -105,31 +109,36 @@ public class UrlValidator implements URL
    */
   private static final int PARSE_AUTHORITY_EXTRA = 3;
 
-  private static final Pattern PATH_PATTERN =
-    Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
+  private static final Pattern PATH_PATTERN = Pattern
+      .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
 
   private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");
 
-  private static final Pattern LEGAL_ASCII_PATTERN =
-    Pattern.compile("^[\\x21-\\x7E]+$");
+  private static final Pattern LEGAL_ASCII_PATTERN = Pattern
+      .compile("^[\\x21-\\x7E]+$");
 
-  private static final Pattern IP_V4_DOMAIN_PATTERN =
-    Pattern.compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
+  private static final Pattern IP_V4_DOMAIN_PATTERN = Pattern
+      .compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
 
-  private static final Pattern DOMAIN_PATTERN =
-    Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");
+  private static final Pattern DOMAIN_PATTERN = Pattern.compile("^" + ATOM
+      + "(\\." + ATOM + ")*$");
 
-  private static final Pattern PORT_PATTERN =
-    Pattern.compile("^:(\\d{1,5})$");
+  private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");
 
-  private static final Pattern ATOM_PATTERN =
-    Pattern.compile("(" + ATOM + ")");
+  private static final Pattern ATOM_PATTERN = Pattern.compile("(" + ATOM + ")");
 
-  private static final Pattern ALPHA_PATTERN =
-    Pattern.compile("^[" + ALPHA_CHARS + "]");
+  private static final Pattern ALPHA_PATTERN = Pattern.compile("^["
+      + ALPHA_CHARS + "]");
 
   private Configuration conf;
 
+  private int maxTldLength;
+
+  private static String TOP_LEVEL_DOMAIN_LENGTH = "urlfilter.tld.length"; // maximum length of TLD
+                                                                          
+
+  private static final int TOP_LEVEL_DOMAIN_LENGTH_VALUE = 8;
+
   public String filter(String urlString) {
     return isValid(urlString) ? urlString : null;
   }
@@ -140,13 +149,19 @@ public class UrlValidator implements URL
 
   public void setConf(Configuration conf) {
     this.conf = conf;
+    maxTldLength = conf.getInt(TOP_LEVEL_DOMAIN_LENGTH, 8);
+    if (maxTldLength <= 2)
+      maxTldLength = TOP_LEVEL_DOMAIN_LENGTH_VALUE;
   }
 
   /**
-   * <p>Checks if a field has a valid url address.</p>
-   *
-   * @param value The value validation is being performed on.
-   * A <code>null</code> value is considered invalid.
+   * <p>
+   * Checks if a field has a valid url address.
+   * </p>
+   * 
+   * @param value
+   *          The value validation is being performed on. A <code>null</code>
+   *          value is considered invalid.
    * @return true if the url is valid.
    */
   private boolean isValid(String value) {
@@ -184,11 +199,13 @@ public class UrlValidator implements URL
   }
 
   /**
-   * Validate scheme. If schemes[] was initialized to a non null,
-   * then only those scheme's are allowed.  Note this is slightly different
-   * than for the constructor.
-   * @param scheme The scheme to validate.  A <code>null</code> value is
-   * considered invalid.
+   * Validate scheme. If schemes[] was initialized to a non null, then only
+   * those scheme's are allowed. Note this is slightly different than for the
+   * constructor.
+   * 
+   * @param scheme
+   *          The scheme to validate. A <code>null</code> value is considered
+   *          invalid.
    * @return true if valid.
    */
   private boolean isValidScheme(String scheme) {
@@ -200,10 +217,12 @@ public class UrlValidator implements URL
   }
 
   /**
-   * Returns true if the authority is properly formatted.  An authority is
-   * the combination of hostname and port.  A <code>null</code> authority
-   * value is considered invalid.
-   * @param authority Authority value to validate.
+   * Returns true if the authority is properly formatted. An authority is the
+   * combination of hostname and port. A <code>null</code> authority value is
+   * considered invalid.
+   * 
+   * @param authority
+   *          Authority value to validate.
    * @return true if authority (hostname and port) is valid.
    */
   private boolean isValidAuthority(String authority) {
@@ -235,7 +254,7 @@ public class UrlValidator implements URL
           if (Integer.parseInt(ipSegment) > 255) {
             return false;
           }
-        } catch(NumberFormatException e) {
+        } catch (NumberFormatException e) {
           return false;
         }
 
@@ -251,8 +270,8 @@ public class UrlValidator implements URL
       // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
       char[] chars = hostIP.toCharArray();
       int size = 1;
-      for(int i=0; i<chars.length; i++) {
-        if(chars[i] == '.') {
+      for (int i = 0; i < chars.length; i++) {
+        if (chars[i] == '.') {
           size++;
         }
       }
@@ -264,12 +283,12 @@ public class UrlValidator implements URL
       while (atomMatcher.find()) {
         domainSegment[segCount] = atomMatcher.group();
         segLen = domainSegment[segCount].length() + 1;
-        hostIP = (segLen >= hostIP.length()) ? ""
-                                             : hostIP.substring(segLen);
+        hostIP = (segLen >= hostIP.length()) ? "" : hostIP.substring(segLen);
         segCount++;
       }
       String topLevel = domainSegment[segCount - 1];
-      if (topLevel.length() < 2 || topLevel.length() > 4) {
+      if (topLevel.length() < 2
+          || topLevel.length() > maxTldLength) {
         return false;
       }
 
@@ -300,10 +319,13 @@ public class UrlValidator implements URL
   }
 
   /**
-   * <p>Checks if the field isn't null and length of the field is greater
-   * than zero not including whitespace.</p>
-   *
-   * @param value The value validation is being performed on.
+   * <p>
+   * Checks if the field isn't null and length of the field is greater than zero
+   * not including whitespace.
+   * </p>
+   * 
+   * @param value
+   *          The value validation is being performed on.
    * @return true if blank or null.
    */
   private boolean isBlankOrNull(String value) {
@@ -311,9 +333,11 @@ public class UrlValidator implements URL
   }
 
   /**
-   * Returns true if the path is valid.  A <code>null</code> value is
-   * considered invalid.
-   * @param path Path value to validate.
+   * Returns true if the path is valid. A <code>null</code> value is considered
+   * invalid.
+   * 
+   * @param path
+   *          Path value to validate.
    * @return true if path is valid.
    */
   private boolean isValidPath(String path) {
@@ -335,7 +359,9 @@ public class UrlValidator implements URL
   /**
    * Returns true if the query is null or it's a properly formatted query
    * string.
-   * @param query Query value to validate.
+   * 
+   * @param query
+   *          Query value to validate.
    * @return true if query is valid.
    */
   private boolean isValidQuery(String query) {
@@ -348,8 +374,11 @@ public class UrlValidator implements URL
 
   /**
    * Returns the number of times the token appears in the target.
-   * @param token Token value to be counted.
-   * @param target Target value to count tokens in.
+   * 
+   * @param token
+   *          Token value to be counted.
+   * @param target
+   *          Target value to count tokens in.
    * @return the number of tokens.
    */
   private int countToken(String token, String target) {

Added: nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java?rev=1582410&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java (added)
+++ nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java Thu Mar 27 17:18:55 2014
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.validator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.urlfilter.validator.UrlValidator;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+import org.junit.Test;
+
+import junit.framework.TestCase;
+
+/**
+ * JUnit test case for UrlValidator which checks that:
+ * 1. valid URLs are not filtered while invalid ones are filtered;
+ * 2. a URL's scheme, authority, path and query are validated;
+ * 3. the configured maximum TLD length ("urlfilter.tld.length") is
+ *    enforced.
+ */
+
+public class TestUrlValidator extends TestCase {
+  private Configuration conf;
+  private static int tldLength;
+  private String validUrl;
+  private String invalidUrl;
+  private String preUrl = "http://example.";
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    conf = NutchConfiguration.create();
+    tldLength = conf.getInt("urlfilter.tld.length", 8);
+  }
+
+  /**
+   * Test method for
+   * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}
+   * .
+   */
+  @Test
+  public void testFilter() {
+    UrlValidator url_validator = new UrlValidator();
+    url_validator.setConf(conf);
+
+    validUrl = generateValidTld(tldLength);
+    invalidUrl = generateInvalidTld(tldLength);
+
+    assertNotNull(url_validator);
+
+    // invalid urls
+    assertNull("Filtering on a null object should return null",
+        url_validator.filter(null));
+    assertNull("Invalid url: example.com/file[/].html",
+        url_validator.filter("example.com/file[/].html"));
+    assertNull("Invalid url: http://www.example.com/space here.html",
+        url_validator.filter("http://www.example.com/space here.html"));
+    assertNull("Invalid url: /main.html", url_validator.filter("/main.html"));
+    assertNull("Invalid url: www.example.com/main.html",
+        url_validator.filter("www.example.com/main.html"));
+    assertNull("Invalid url: ftp:www.example.com/main.html",
+        url_validator.filter("ftp:www.example.com/main.html"));
+    assertNull("Inalid url: http://999.000.456.32/nutch/trunk/README.txt",
+        url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt"));
+    assertNull("Invalid url: http://www.example.com/ma|in\\toc.html",
+        url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
+    // test tld limit
+    assertNull("InValid url: " + invalidUrl, url_validator.filter(invalidUrl));
+
+    // valid urls
+    assertNotNull("Valid url: https://issues.apache.org/jira/NUTCH-1127",
+        url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
+    assertNotNull(
+        "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&amp;name=Fonzi&amp;mood=happy&amp;coat=leather",
+        url_validator
+            .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&amp;name=Fonzi&amp;mood=happy&amp;coat=leather"));
+    assertNotNull(
+        "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
+        url_validator
+            .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
+    assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf",
+        url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
+    // test tld limit
+    assertNotNull("Valid url: " + validUrl, url_validator.filter(validUrl));
+
+  }
+
+  /**
+   * Generates a sample URL with a valid TLD: "http://example." followed by the given number of random lowercase letters.
+   */
+  public String generateValidTld(int length) {
+    StringBuffer buffer = new StringBuffer();
+    for (int i = 1; i <= length; i++) {
+
+      char c = (char) ('a' + Math.random() * 26);
+      buffer.append(c);
+    }
+    String tempValidUrl = preUrl + buffer.toString();
+    return tempValidUrl;
+  }
+
+  /**
+   * Generates a sample URL with an invalid TLD: "http://example." followed
+   * by length + 1 random lowercase letters, i.e. one character too many.
+   */
+  public String generateInvalidTld(int length) {
+
+    StringBuffer buffer = new StringBuffer();
+    for (int i = 1; i <= length + 1; i++) {
+
+      char c = (char) ('a' + Math.random() * 26);
+      buffer.append(c);
+    }
+    String tempInvalidUrl = preUrl + buffer.toString();
+    return tempInvalidUrl;
+
+  }
+}