You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/03/27 18:18:56 UTC
svn commit: r1582410 - in /nutch/branches/2.x: ./ conf/ src/plugin/
src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/
src/plugin/urlfilter-validator/src/test/
src/plugin/urlfilter-validator/src/test/org/ src/plugin/urlfilter...
Author: lewismc
Date: Thu Mar 27 17:18:55 2014
New Revision: 1582410
URL: http://svn.apache.org/r1582410
Log:
NUTCH-1727 Configurable length for Tlds
Added:
nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/
nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/
nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/
nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/
nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/
nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/
nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/build.xml
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/plugin/build.xml
nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Mar 27 17:18:55 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1727 Configurable length for Tlds (Sertac TURKEL via lewismc)
+
* NUTCH-1738 Expose number of URLs generated per batch in GeneratorJob (Talat UYARER via lewismc)
* NUTCH-1671 indexchecker to add digest field (snagel, lufeng)
Modified: nutch/branches/2.x/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Thu Mar 27 17:18:55 2014
@@ -989,6 +989,7 @@
<source path="${basedir}/src/plugin/urlfilter-suffix/src/java/" />
<source path="${basedir}/src/plugin/urlfilter-suffix/src/test/" />
<source path="${basedir}/src/plugin/urlfilter-validator/src/java/" />
+ <source path="${basedir}/src/plugin/urlfilter-validator/src/test/" />
<source path="${basedir}/src/plugin/urlnormalizer-basic/src/java/" />
<source path="${basedir}/src/plugin/urlnormalizer-basic/src/test/" />
<source path="${basedir}/src/plugin/urlnormalizer-pass/src/java/" />
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Thu Mar 27 17:18:55 2014
@@ -973,6 +973,12 @@
<!-- urlfilter plugin properties -->
<property>
+ <name>urlfilter.tld.length</name>
+ <value></value>
+ <description>Maximum character length of a top-level domain (TLD); values of 2 or less are replaced by the default of 8.</description>
+</property>
+
+<property>
<name>urlfilter.domain.file</name>
<value>domain-urlfilter.txt</value>
<description>Name of file on CLASSPATH containing either top level domains or
Modified: nutch/branches/2.x/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Thu Mar 27 17:18:55 2014
@@ -91,6 +91,7 @@
<ant dir="urlfilter-prefix" target="test"/>
<ant dir="urlfilter-regex" target="test"/>
<ant dir="urlfilter-suffix" target="test"/>
+ <ant dir="urlfilter-validator" target="test"/>
<ant dir="urlnormalizer-basic" target="test"/>
<ant dir="urlnormalizer-pass" target="test"/>
<ant dir="urlnormalizer-regex" target="test"/>
Modified: nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java?rev=1582410&r1=1582409&r2=1582410&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java (original)
+++ nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java Thu Mar 27 17:18:55 2014
@@ -23,12 +23,16 @@ import org.apache.hadoop.conf.Configurat
import org.apache.nutch.net.URLFilter;
/**
- * <p>Validates URLs.</p>
- *
- * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b,
- * Date: 03/07/02,
- * http://javascript.internet.com. However, this validation now bears little
- * resemblance to the php original.</p>
+ * <p>
+ * Validates URLs.
+ * </p>
+ *
+ * <p>
+ * Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date:
+ * 03/07/02, http://javascript.internet.com. However, this validation now bears
+ * little resemblance to the php original.
+ * </p>
+ *
* <pre>
* Example of usage:
* UrlValidator urlValidator = UrlValidator.get();
@@ -37,17 +41,17 @@ import org.apache.nutch.net.URLFilter;
* } else {
* System.out.println("url is invalid");
* }
- *
+ *
* prints out "url is valid"
- * </pre>
- *
- * <p>Based on UrlValidator code from Apache commons-validator.</p>
- *
- * @see
- * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
- * Uniform Resource Identifiers (URI): Generic Syntax
- * </a>
- *
+ * </pre>
+ *
+ * <p>
+ * Based on UrlValidator code from Apache commons-validator.
+ * </p>
+ *
+ * @see <a href='http://www.ietf.org/rfc/rfc2396.txt' > Uniform Resource
+ * Identifiers (URI): Generic Syntax </a>
+ *
*/
public class UrlValidator implements URLFilter {
@@ -61,7 +65,7 @@ public class UrlValidator implements URL
private static final String SCHEME_CHARS = ALPHA_CHARS;
- // Drop numeric, and "+-." for now
+ // Drop numeric, and "+-." for now
private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
private static final String ATOM = VALID_CHARS + '+';
@@ -69,9 +73,9 @@ public class UrlValidator implements URL
/**
* This expression derived/taken from the BNF for URI (RFC2396).
*/
- private static final Pattern URL_PATTERN =
- Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)" +
- "(\\?([^#]*))?(#(.*))?");
+ private static final Pattern URL_PATTERN = Pattern
+ .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)"
+ + "(\\?([^#]*))?(#(.*))?");
/**
* Schema/Protocol (ie. http:, ftp:, file:, etc).
@@ -90,11 +94,11 @@ public class UrlValidator implements URL
/**
* Protocol (ie. http:, ftp:,https:).
*/
- private static final Pattern SCHEME_PATTERN =
- Pattern.compile("^[" + SCHEME_CHARS + "]+");
+ private static final Pattern SCHEME_PATTERN = Pattern.compile("^["
+ + SCHEME_CHARS + "]+");
- private static final Pattern AUTHORITY_PATTERN =
- Pattern.compile("^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
+ private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^(["
+ + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
private static final int PARSE_AUTHORITY_HOST_IP = 1;
@@ -105,31 +109,36 @@ public class UrlValidator implements URL
*/
private static final int PARSE_AUTHORITY_EXTRA = 3;
- private static final Pattern PATH_PATTERN =
- Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
+ private static final Pattern PATH_PATTERN = Pattern
+ .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");
- private static final Pattern LEGAL_ASCII_PATTERN =
- Pattern.compile("^[\\x21-\\x7E]+$");
+ private static final Pattern LEGAL_ASCII_PATTERN = Pattern
+ .compile("^[\\x21-\\x7E]+$");
- private static final Pattern IP_V4_DOMAIN_PATTERN =
- Pattern.compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
+ private static final Pattern IP_V4_DOMAIN_PATTERN = Pattern
+ .compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
- private static final Pattern DOMAIN_PATTERN =
- Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");
+ private static final Pattern DOMAIN_PATTERN = Pattern.compile("^" + ATOM
+ + "(\\." + ATOM + ")*$");
- private static final Pattern PORT_PATTERN =
- Pattern.compile("^:(\\d{1,5})$");
+ private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");
- private static final Pattern ATOM_PATTERN =
- Pattern.compile("(" + ATOM + ")");
+ private static final Pattern ATOM_PATTERN = Pattern.compile("(" + ATOM + ")");
- private static final Pattern ALPHA_PATTERN =
- Pattern.compile("^[" + ALPHA_CHARS + "]");
+ private static final Pattern ALPHA_PATTERN = Pattern.compile("^["
+ + ALPHA_CHARS + "]");
private Configuration conf;
+ private int maxTldLength;
+
+ private static String TOP_LEVEL_DOMAIN_LENGTH = "urlfilter.tld.length"; // maximum length of TLD
+
+
+ private static final int TOP_LEVEL_DOMAIN_LENGTH_VALUE = 8;
+
public String filter(String urlString) {
return isValid(urlString) ? urlString : null;
}
@@ -140,13 +149,19 @@ public class UrlValidator implements URL
public void setConf(Configuration conf) {
this.conf = conf;
+ maxTldLength = conf.getInt(TOP_LEVEL_DOMAIN_LENGTH, 8);
+ if (maxTldLength <= 2)
+ maxTldLength = TOP_LEVEL_DOMAIN_LENGTH_VALUE;
}
/**
- * <p>Checks if a field has a valid url address.</p>
- *
- * @param value The value validation is being performed on.
- * A <code>null</code> value is considered invalid.
+ * <p>
+ * Checks if a field has a valid url address.
+ * </p>
+ *
+ * @param value
+ * The value validation is being performed on. A <code>null</code>
+ * value is considered invalid.
* @return true if the url is valid.
*/
private boolean isValid(String value) {
@@ -184,11 +199,13 @@ public class UrlValidator implements URL
}
/**
- * Validate scheme. If schemes[] was initialized to a non null,
- * then only those scheme's are allowed. Note this is slightly different
- * than for the constructor.
- * @param scheme The scheme to validate. A <code>null</code> value is
- * considered invalid.
+ * Validate scheme. If schemes[] was initialized to a non-null value, then only
+ * those schemes are allowed. Note this is slightly different than for the
+ * constructor.
+ *
+ * @param scheme
+ * The scheme to validate. A <code>null</code> value is considered
+ * invalid.
* @return true if valid.
*/
private boolean isValidScheme(String scheme) {
@@ -200,10 +217,12 @@ public class UrlValidator implements URL
}
/**
- * Returns true if the authority is properly formatted. An authority is
- * the combination of hostname and port. A <code>null</code> authority
- * value is considered invalid.
- * @param authority Authority value to validate.
+ * Returns true if the authority is properly formatted. An authority is the
+ * combination of hostname and port. A <code>null</code> authority value is
+ * considered invalid.
+ *
+ * @param authority
+ * Authority value to validate.
* @return true if authority (hostname and port) is valid.
*/
private boolean isValidAuthority(String authority) {
@@ -235,7 +254,7 @@ public class UrlValidator implements URL
if (Integer.parseInt(ipSegment) > 255) {
return false;
}
- } catch(NumberFormatException e) {
+ } catch (NumberFormatException e) {
return false;
}
@@ -251,8 +270,8 @@ public class UrlValidator implements URL
// TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
char[] chars = hostIP.toCharArray();
int size = 1;
- for(int i=0; i<chars.length; i++) {
- if(chars[i] == '.') {
+ for (int i = 0; i < chars.length; i++) {
+ if (chars[i] == '.') {
size++;
}
}
@@ -264,12 +283,12 @@ public class UrlValidator implements URL
while (atomMatcher.find()) {
domainSegment[segCount] = atomMatcher.group();
segLen = domainSegment[segCount].length() + 1;
- hostIP = (segLen >= hostIP.length()) ? ""
- : hostIP.substring(segLen);
+ hostIP = (segLen >= hostIP.length()) ? "" : hostIP.substring(segLen);
segCount++;
}
String topLevel = domainSegment[segCount - 1];
- if (topLevel.length() < 2 || topLevel.length() > 4) {
+ if (topLevel.length() < 2
+ || topLevel.length() > maxTldLength) {
return false;
}
@@ -300,10 +319,13 @@ public class UrlValidator implements URL
}
/**
- * <p>Checks if the field isn't null and length of the field is greater
- * than zero not including whitespace.</p>
- *
- * @param value The value validation is being performed on.
+ * <p>
+ * Checks if the field isn't null and length of the field is greater than zero
+ * not including whitespace.
+ * </p>
+ *
+ * @param value
+ * The value validation is being performed on.
* @return true if blank or null.
*/
private boolean isBlankOrNull(String value) {
@@ -311,9 +333,11 @@ public class UrlValidator implements URL
}
/**
- * Returns true if the path is valid. A <code>null</code> value is
- * considered invalid.
- * @param path Path value to validate.
+ * Returns true if the path is valid. A <code>null</code> value is considered
+ * invalid.
+ *
+ * @param path
+ * Path value to validate.
* @return true if path is valid.
*/
private boolean isValidPath(String path) {
@@ -335,7 +359,9 @@ public class UrlValidator implements URL
/**
* Returns true if the query is null or it's a properly formatted query
* string.
- * @param query Query value to validate.
+ *
+ * @param query
+ * Query value to validate.
* @return true if query is valid.
*/
private boolean isValidQuery(String query) {
@@ -348,8 +374,11 @@ public class UrlValidator implements URL
/**
* Returns the number of times the token appears in the target.
- * @param token Token value to be counted.
- * @param target Target value to count tokens in.
+ *
+ * @param token
+ * Token value to be counted.
+ * @param target
+ * Target value to count tokens in.
* @return the number of tokens.
*/
private int countToken(String token, String target) {
Added: nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java?rev=1582410&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java (added)
+++ nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java Thu Mar 27 17:18:55 2014
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.validator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.urlfilter.validator.UrlValidator;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+import org.junit.Test;
+
+import junit.framework.TestCase;
+
+/**
+ * JUnit test case which checks (1) that valid URLs pass the filter while
+ * invalid ones are rejected, and (2) that a URL's scheme, authority, path and
+ * query are validated. It also checks the maximum allowed length of the TLD.
+ *
+ *
+ */
+
+public class TestUrlValidator extends TestCase {
+ private Configuration conf;
+ private static int tldLength;
+ private String validUrl;
+ private String invalidUrl;
+ private String preUrl = "http://example.";
+
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ conf = NutchConfiguration.create();
+ tldLength = conf.getInt("urlfilter.tld.length", 8);
+ }
+
+ /**
+ * Test method for
+ * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}
+ * .
+ */
+ @Test
+ public void testFilter() {
+ UrlValidator url_validator = new UrlValidator();
+ url_validator.setConf(conf);
+
+ validUrl = generateValidTld(tldLength);
+ invalidUrl = generateInvalidTld(tldLength);
+
+ assertNotNull(url_validator);
+
+ // invalid urls
+ assertNull("Filtering on a null object should return null",
+ url_validator.filter(null));
+ assertNull("Invalid url: example.com/file[/].html",
+ url_validator.filter("example.com/file[/].html"));
+ assertNull("Invalid url: http://www.example.com/space here.html",
+ url_validator.filter("http://www.example.com/space here.html"));
+ assertNull("Invalid url: /main.html", url_validator.filter("/main.html"));
+ assertNull("Invalid url: www.example.com/main.html",
+ url_validator.filter("www.example.com/main.html"));
+ assertNull("Invalid url: ftp:www.example.com/main.html",
+ url_validator.filter("ftp:www.example.com/main.html"));
+ assertNull("Inalid url: http://999.000.456.32/nutch/trunk/README.txt",
+ url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt"));
+ assertNull("Invalid url: http://www.example.com/ma|in\\toc.html",
+ url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
+ // test tld limit
+ assertNull("InValid url: " + invalidUrl, url_validator.filter(invalidUrl));
+
+ // valid urls
+ assertNotNull("Valid url: https://issues.apache.org/jira/NUTCH-1127",
+ url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
+ assertNotNull(
+ "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather",
+ url_validator
+ .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather"));
+ assertNotNull(
+ "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
+ url_validator
+ .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
+ assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf",
+ url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
+ // test tld limit
+ assertNotNull("Valid url: " + validUrl, url_validator.filter(validUrl));
+
+ }
+
+ /**
+ * Generates a sample URL whose random lowercase TLD is exactly {@code length} characters long.
+ */
+ public String generateValidTld(int length) {
+ StringBuffer buffer = new StringBuffer();
+ for (int i = 1; i <= length; i++) {
+
+ char c = (char) ('a' + Math.random() * 26);
+ buffer.append(c);
+ }
+ String tempValidUrl = preUrl + buffer.toString();
+ return tempValidUrl;
+ }
+
+ /**
+ * Generates a sample URL whose random lowercase TLD is one character longer
+ * than {@code length}.
+ */
+ public String generateInvalidTld(int length) {
+
+ StringBuffer buffer = new StringBuffer();
+ for (int i = 1; i <= length + 1; i++) {
+
+ char c = (char) ('a' + Math.random() * 26);
+ buffer.append(c);
+ }
+ String tempInvalidUrl = preUrl + buffer.toString();
+ return tempInvalidUrl;
+
+ }
+}