You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2007/03/11 07:50:11 UTC
svn commit: r516862 -
/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
Author: siren
Date: Sat Mar 10 22:50:10 2007
New Revision: 516862
URL: http://svn.apache.org/viewvc?view=rev&rev=516862
Log:
change urlnormalizer-basic to use regular expressions from jre
Modified:
lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?view=diff&rev=516862&r1=516861&r2=516862
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Sat Mar 10 22:50:10 2007
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -14,60 +14,62 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.nutch.net.urlnormalizer.basic;
import java.net.URL;
import java.net.MalformedURLException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
-// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-// Nutch imports
import org.apache.nutch.net.URLNormalizer;
-import org.apache.nutch.util.LogUtil;
import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.*;
-/** Converts URLs to a normal form . */
+/**
+ * Converts URLs to a normal form.
+ * <p>
+ * All substitutions will be done step by step, to ensure that certain
+ * constellations will be normalized, too.
+ * </p>
+ * <p>
+ * For example: "/aa/bb/../../cc/../foo.html" will be normalized in the following
+ * manner: "/aa/bb/../../cc/../foo.html" "/aa/../cc/../foo.html"
+ * "/cc/../foo.html" "/foo.html".
+ * </p>
+ * <p>
+ * The normalization also takes care of leading "/../", which will be replaced
+ * by "/", because this is rather a sign of bad webserver configuration than
+ * of a wanted link. For example, urls like "http://www.foo.com/../" should
+ * return a http 404 error instead of redirecting to "http://www.foo.com".
+ * </p>
+ */
public class BasicURLNormalizer implements URLNormalizer {
public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class);
- private Perl5Compiler compiler = new Perl5Compiler();
- private ThreadLocal matchers = new ThreadLocal() {
- protected synchronized Object initialValue() {
- return new Perl5Matcher();
- }
- };
- private Rule relativePathRule = null;
- private Rule leadingRelativePathRule = null;
+ /**
+ * This pattern tries to find spots like "/xx/../" in the url, which could
+ * be replaced by "/". xx consists of chars different than "/" (slash) and
+ * needs to have at least one char different from ".".
+ */
+ private static final Pattern RELATIVE_PATH_PATTERN = Pattern.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)");
+
+ private static final String RELATIVE_PATH_SUBSTITUTION="/";
+
+ /**
+ * This pattern tries to find spots like leading "/../" in the url, which
+ * could be replaced by "/".
+ */
+ private static final Pattern LEADING_RELATIVE_PATH_PATTERN = Pattern.compile("^(/\\.\\./)+");
+
+ private static final String LEADING_RELATIVE_PATH_SUBSTITUTION="/";
private Configuration conf;
+
public BasicURLNormalizer() {
- try {
- // this pattern tries to find spots like "/xx/../" in the url, which
- // could be replaced by "/" xx consists of chars, different then "/"
- // (slash) and needs to have at least one char different from "."
- relativePathRule = new Rule();
- relativePathRule.pattern = (Perl5Pattern)
- compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
- Perl5Compiler.READ_ONLY_MASK);
- relativePathRule.substitution = new Perl5Substitution("/");
-
- // this pattern tries to find spots like leading "/../" in the url,
- // which could be replaced by "/"
- leadingRelativePathRule = new Rule();
- leadingRelativePathRule.pattern = (Perl5Pattern)
- compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
- leadingRelativePathRule.substitution = new Perl5Substitution("/");
-
- } catch (MalformedPatternException e) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- throw new RuntimeException(e);
- }
}
public String normalize(String urlString, String scope)
@@ -129,56 +131,25 @@
return urlString;
}
- private String substituteUnnecessaryRelativePaths(String file) {
- String fileWorkCopy = file;
- int oldLen = file.length();
- int newLen = oldLen - 1;
-
- // All substitutions will be done step by step, to ensure that certain
- // constellations will be normalized, too
- //
- // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
- // following manner:
- // "/aa/bb/../../cc/../foo.html"
- // "/aa/../cc/../foo.html"
- // "/cc/../foo.html"
- // "/foo.html"
- //
- // The normalization also takes care of leading "/../", which will be
- // replaced by "/", because this is a rather a sign of bad webserver
- // configuration than of a wanted link. For example, urls like
- // "http://www.foo.com/../" should return a http 404 error instead of
- // redirecting to "http://www.foo.com".
- //
- Perl5Matcher matcher = (Perl5Matcher)matchers.get();
-
- while (oldLen != newLen) {
- // substitue first occurence of "/xx/../" by "/"
- oldLen = fileWorkCopy.length();
- fileWorkCopy = Util.substitute
- (matcher, relativePathRule.pattern,
- relativePathRule.substitution, fileWorkCopy, 1);
-
- // remove leading "/../"
- fileWorkCopy = Util.substitute
- (matcher, leadingRelativePathRule.pattern,
- leadingRelativePathRule.substitution, fileWorkCopy, 1);
- newLen = fileWorkCopy.length();
- }
-
- return fileWorkCopy;
- }
-
-
- /**
- * Class which holds a compiled pattern and its corresponding substition
- * string.
- */
- private static class Rule {
- public Perl5Pattern pattern;
- public Perl5Substitution substitution;
+ private String substituteUnnecessaryRelativePaths(String file) {
+ String fileWorkCopy = file;
+ int oldLen = file.length();
+ int newLen = oldLen - 1;
+ Matcher m;
+
+ while (oldLen != newLen) {
+ oldLen = fileWorkCopy.length();
+ m = RELATIVE_PATH_PATTERN.matcher(fileWorkCopy);
+ // substitute first occurrence of "/xx/../" by "/"
+ fileWorkCopy = m.replaceFirst(RELATIVE_PATH_SUBSTITUTION);
+ m = LEADING_RELATIVE_PATH_PATTERN.matcher(fileWorkCopy);
+ // remove leading "/../"
+ fileWorkCopy = m.replaceFirst(LEADING_RELATIVE_PATH_SUBSTITUTION);
+ newLen = fileWorkCopy.length();
}
+ return fileWorkCopy;
+ }
public void setConf(Configuration conf) {
this.conf = conf;
@@ -189,4 +160,3 @@
}
}
-