You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2007/03/11 22:18:24 UTC
svn commit: r517015 - in /lucene/nutch/trunk: ./ lib/
src/java/org/apache/nutch/parse/ src/plugin/
src/plugin/index-more/src/java/org/apache/nutch/indexer/more/
src/plugin/index-more/src/test/ src/plugin/ontology/lib/
src/plugin/parse-js/src/java/org/a...
Author: siren
Date: Sun Mar 11 14:18:23 2007
New Revision: 517015
URL: http://svn.apache.org/viewvc?view=rev&rev=517015
Log:
merging 517012:516728 excluding changes made by dennis
Added:
lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar
- copied unchanged from r516728, lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar
lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
- copied unchanged from r516728, lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt
- copied unchanged from r516728, lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt
lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar
- copied unchanged from r516728, lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
- copied unchanged from r516728, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
Removed:
lucene/nutch/trunk/src/plugin/index-more/src/test/
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html
lucene/nutch/trunk/src/plugin/parse-js/src/test/
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 14:18:23 2007
@@ -158,18 +158,11 @@
53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins
framework to operate properly (Heiko Dietze via mattmann)
-54. Change OutlinkExtractor to use Regular Expressions from JRE (siren)
-
-55. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan
+54. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan
Groschupf via kubes)
-56. NUTCH-436 - Incorrect handling of relative paths when the embedded URL
- path is empty (kubes)
-
-57. Replace oro with jre regular expressions in plugins, remove oro from
- dependencies (siren)
-
-58. Remove redundant commons logging jars (siren)
+55. NUTCH-436 - Incorrect handling of relative paths when the embedded URL
+ path is empty (kubes)
Release 0.8 - 2006-07-25
Modified: lucene/nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Sun Mar 11 14:18:23 2007
@@ -148,20 +148,8 @@
<jar jarfile="${build.dir}/${final.name}.job">
<zipfileset dir="${build.classes}"/>
<zipfileset dir="${conf.dir}" excludes="*.template"/>
- <zipfileset dir="${lib.dir}" prefix="lib" includes="**/*.jar">
- <exclude name="hadoop-*.jar"/>
- <exclude name="servlet-*.jar"/>
- <exclude name="junit*.jar"/>
- <exclude name="jetty-*.jar"/>
- <exclude name="pmd-ext/*"/>
- <exclude name="jetty-ext/*"/>
- <exclude name="jets3t.jar"/>
- <exclude name="taglib*.jar"/>
- <exclude name="commons-cli*.jar"/>
- <exclude name="xerces-*.jar"/>
- <exclude name="commons-logging-1.0.4.jar"/>
- <exclude name="log4j-1.2.13.jar"/>
- </zipfileset>
+ <zipfileset dir="${lib.dir}" prefix="lib"
+ includes="**/*.jar" excludes="hadoop-*.jar"/>
<zipfileset dir="${build.plugins}" prefix="plugins"/>
</jar>
</target>
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Sun Mar 11 14:18:23 2007
@@ -1,4 +1,4 @@
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -14,21 +14,28 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.nutch.parse;
import java.net.MalformedURLException;
import java.util.ArrayList;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
/**
- * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
- * plain text using Regular Expressions.
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s
+ * / URLs from plain text using Regular Expressions.
*
* @see <a
* href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
@@ -37,14 +44,12 @@
* </a>
*
* @author Stephan Strittmatter - http://www.sybit.de
- *
+ * @version 1.0
* @since 0.7
*/
public class OutlinkExtractor {
private static final Log LOG = LogFactory.getLog(OutlinkExtractor.class);
- private static final Outlink[] NO_LINKS = new Outlink[0];
-
/**
* Regex pattern to get URLs within a plain text.
*
@@ -52,63 +57,190 @@
* href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
* </a>
*/
- private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
-
- static final Pattern urlPattern = Pattern.compile(URL_PATTERN);
+ private static final String URL_PATTERN =
+ "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
/**
- * Extracts outlinks from a plain text.
- * </p>
- * @param plainText
+ * Extracts <code>Outlink</code> from given plain text.
+ * Applying this method to non-plain-text can result in extremely lengthy
+ * runtimes for parasitic cases (postscript is a known example).
+ * @param plainText the plain text from which URLs should be extracted.
*
- * @return Array of <code>Outlink</code> s within found in plainText
+ * @return Array of <code>Outlink</code>s within found in plainText
*/
- public static Outlink[] getOutlinks(final String plainText, Configuration conf){
- return getOutlinks(plainText, null, conf);
+ public static Outlink[] getOutlinks(final String plainText, Configuration conf) {
+ return OutlinkExtractor.getOutlinks(plainText, "", conf);
}
-
/**
- * Extracts outlinks from a plain text.
- * </p>
- * @param plainText text to extract urls from
+ * Extracts <code>Outlink</code> from given plain text and adds anchor
+ * to the extracted <code>Outlink</code>s
+ *
+ * @param plainText the plain text from which URLs should be extracted.
+ * @param anchor the anchor of the url
*
- * @return Array of <code>Outlink</code> s found in plainText
+ * @return Array of <code>Outlink</code>s within found in plainText
*/
- public static Outlink[] getOutlinks(final String plainText, String anchor,
- Configuration conf) {
-
- if(plainText == null){
- return NO_LINKS;
- }
-
- final ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
- Outlink[] retval;
- Outlink link;
-
- Matcher m = urlPattern.matcher(plainText);
- while (m.find()) {
-
- try {
- link = new Outlink(m.toMatchResult().group(), anchor, conf);
- outlinks.add(link);
- } catch (MalformedURLException ex) {
- // if it is a malformed URL we just throw it away and continue with
- // extraction.
- if (LOG.isDebugEnabled()) {
- LOG.debug("extracted malformed url:" + m.toMatchResult().group(), ex);
+ public static Outlink[] getOutlinks(final String plainText, String anchor, Configuration conf) {
+ long start = System.currentTimeMillis();
+ final List outlinks = new ArrayList();
+
+ try {
+ final PatternCompiler cp = new Perl5Compiler();
+ final Pattern pattern = cp.compile(URL_PATTERN,
+ Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+ | Perl5Compiler.MULTILINE_MASK);
+ final PatternMatcher matcher = new Perl5Matcher();
+
+ final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+ MatchResult result;
+ String url;
+
+ //loop the matches
+ while (matcher.contains(input, pattern)) {
+ // if this is taking too long, stop matching
+ // (SHOULD really check cpu time used so that heavily loaded systems
+ // do not unnecessarily hit this limit.)
+ if (System.currentTimeMillis() - start >= 60000L) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Time limit exceeded for getOutLinks");
+ }
+ break;
+ }
+ result = matcher.getMatch();
+ url = result.group(0);
+ try {
+ Outlink outlink = new Outlink(url, anchor, conf);
+ outlinks.add(new Outlink(url, anchor, conf));
+ } catch (MalformedURLException mue) {
+ LOG.warn("Invalid url: '" + url + "', skipping.");
}
}
-
+ } catch (Exception ex) {
+ // if the matcher fails (perhaps a malformed URL) we just log it and move on
+ if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
}
- if (outlinks.size() > 0) {
- retval = outlinks.toArray(new Outlink[outlinks.size()]);
+ final Outlink[] retval;
+
+ //create array of the Outlinks
+ if (outlinks != null && outlinks.size() > 0) {
+ retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
} else {
- retval = NO_LINKS;
+ retval = new Outlink[0];
}
return retval;
}
+
+ /**
+ * Extracts outlinks from a plain text. <br />
+ * This Method takes the Jakarta Regexp API.
+ *
+ * @param plainText
+ *
+ * @return Array of <code>Outlink</code> s within found in plainText
+ * @deprecated only for tests
+ */
+ private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
+
+ throw new UnsupportedOperationException(
+ "Implementation commented out. Please uncomment to use it.");
+
+ // final List outlinks = new ArrayList();
+ // String url;
+ // Outlink link;
+ //
+ // RE re = new RE(URL_PATTERN);
+ //
+ // int pos = 0;
+ //
+ // while (re.match(plainText, pos)) {
+ //
+ // url = re.getParen(0);
+ //
+ // if (LOG.isTraceEnabled()) {
+ // LOG.trace("Extracted url: " + url);
+ // }
+ //
+ // try {
+ //
+ // link = new Outlink(url, null);
+ // outlinks.add(link);
+ //
+ // } catch (MalformedURLException ex) {
+ // // if it is a malformed URL we just throw it away and continue with
+ // // extraction.
+ // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
+ // }
+ //
+ // pos = re.getParenEnd(0);
+ // }
+ //
+ // final Outlink[] retval;
+ //
+ // if (pos > 0) {
+ // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+ // } else {
+ // retval = new Outlink[0];
+ // }
+ //
+ // return retval;
+
+ }
+
+ /**
+ * Extracts outlinks from a plain text.
+ * </p>
+ * This Method takes the JDK5 Regexp API.
+ *
+ * @param plainText
+ *
+ * @return Array of <code>Outlink</code> s within found in plainText
+ * @deprecated only for tests
+ */
+ private Outlink[] getOutlinksJDK5Impl(final String plainText) {
+
+ throw new UnsupportedOperationException(
+ "Implementation commented out. Please uncomment to use it.");
+
+ // final List outlinks = new ArrayList();
+ // String url;
+ // Outlink link;
+ //
+ // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
+ // final RE re = new RE(urlPattern);
+ //
+ // int pos = 0;
+ //
+ // while (re.match(plainText, pos)) {
+ //
+ // url = re.getParen(0);
+ //
+ // try {
+ //
+ // link = new Outlink(url, null);
+ // outlinks.add(link);
+ // } catch (MalformedURLException ex) {
+ // // if it is a malformed URL we just throw it away and continue with
+ // // extraction.
+ // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
+ // }
+ //
+ // pos = re.getParenEnd(0);
+ // }
+ //
+ // final Outlink[] retval;
+ //
+ // if (pos > 0) {
+ // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+ // } else {
+ // retval = new Outlink[0];
+ // }
+ //
+ // return retval;
+ }
+
}
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Sun Mar 11 14:18:23 2007
@@ -83,13 +83,11 @@
<target name="test">
<parallel threadCount="2">
<ant dir="creativecommons" target="test"/>
- <ant dir="index-more" target="test"/>
<ant dir="languageidentifier" target="test"/>
<ant dir="lib-http" target="test"/>
<ant dir="ontology" target="test"/>
<!--ant dir="parse-ext" target="test"/-->
<ant dir="parse-html" target="test"/>
- <ant dir="parse-js" target="test"/>
<!-- <ant dir="parse-mp3" target="test"/> -->
<ant dir="parse-msexcel" target="test"/>
<ant dir="parse-mspowerpoint" target="test"/>
Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sun Mar 11 14:18:23 2007
@@ -16,6 +16,14 @@
*/
package org.apache.nutch.indexer.more;
+
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.apache.oro.text.regex.Perl5Pattern;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.MalformedPatternException;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -48,8 +56,6 @@
import java.util.Date;
import java.util.TimeZone;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import org.apache.commons.lang.time.DateUtils;
@@ -238,15 +244,21 @@
// Patterns used to extract filename from possible non-standard
// HTTP header "Content-Disposition". Typically it looks like:
// Content-Disposition: inline; filename="foo.ppt"
+ private PatternMatcher matcher = new Perl5Matcher();
private Configuration conf;
- static Pattern patterns[] = new Pattern[2];
+ static Perl5Pattern patterns[] = {null, null};
static {
+ Perl5Compiler compiler = new Perl5Compiler();
+ try {
// order here is important
patterns[0] =
- Pattern.compile("\\bfilename=['\"](.+)['\"]");
+ (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]");
patterns[1] =
- Pattern.compile("\\bfilename=(\\S+)\\b");
+ (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b");
+ } catch (MalformedPatternException e) {
+ // just ignore
+ }
}
private Document resetTitle(Document doc, ParseData data, String url) {
@@ -254,28 +266,16 @@
if (contentDisposition == null)
return doc;
- String filename = getFileName(contentDisposition);
-
- if (filename != null) {
- doc.add(new Field("title", filename, Field.Store.YES, Field.Index.NO));
- }
-
- return doc;
- }
-
- String getFileName(String value) {
-
- String filename = null;
-
- for (int i = 0; i < patterns.length; i++) {
- Matcher matcher = patterns[i].matcher(value);
- if(matcher.find()) {
- filename = matcher.group(1);
+ MatchResult result;
+ for (int i=0; i<patterns.length; i++) {
+ if (matcher.contains(contentDisposition,patterns[i])) {
+ result = matcher.getMatch();
+ doc.add(new Field("title", result.group(1), Field.Store.YES, Field.Index.NO));
break;
}
}
- return filename;
+ return doc;
}
public void setConf(Configuration conf) {
Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sun Mar 11 14:18:23 2007
@@ -25,8 +25,6 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -42,6 +40,13 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
@@ -49,24 +54,11 @@
import org.w3c.dom.NodeList;
/**
- * <p>
- * This class is a heuristic link extractor for JavaScript files and code
- * snippets. The general idea of a two-pass regex matching comes from Heritrix.
- * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter.
- * </p>
- *
- * <p>
- * This Filter extracts javascript from following locations:
- * </p>
- * <li>from inside <script> tags</li>
- * <li>from html 4.0 events like Window: onload,onunload, Form:
- * onchange,onsubmit,onreset,onselect,onblur,onfocus Keyboard:
- * onkeydown,onkeypress,onkeyup Mouse:
- * onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
- * </li>
- * <li>a href starting with literal "javascript"</li>
- *
- *
+ * This class is a heuristic link extractor for JavaScript files and
+ * code snippets. The general idea of a two-pass regex matching comes from
+ * Heritrix. Parts of the code come from OutlinkExtractor.java
+ * by Stephan Strittmatter.
+ *
* @author Andrzej Bialecki <ab@getopt.org>
*/
public class JSParseFilter implements HtmlParseFilter, Parser {
@@ -105,7 +97,6 @@
Node lNode = n.getAttributes().getNamedItem("language");
if (lNode == null) lang = "javascript";
else lang = lNode.getNodeValue();
- //XXX lang is not checked??
StringBuffer script = new StringBuffer();
NodeList nn = n.getChildNodes();
if (nn.getLength() > 0) {
@@ -113,9 +104,9 @@
if (i > 0) script.append('\n');
script.append(nn.item(i).getNodeValue());
}
- if (LOG.isDebugEnabled()) {
- LOG.info("script: language=" + lang + ", text: " + script.toString());
- }
+ // if (LOG.isInfoEnabled()) {
+ // LOG.info("script: language=" + lang + ", text: " + script.toString());
+ // }
Outlink[] links = getJSLinks(script.toString(), "", base);
if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
// no other children of interest here, go one level up.
@@ -184,7 +175,7 @@
/**
* This method extracts URLs from literals embedded in JavaScript.
*/
- Outlink[] getJSLinks(String plainText, String anchor, String base) {
+ private Outlink[] getJSLinks(String plainText, String anchor, String base) {
final List outlinks = new ArrayList();
URL baseURL = null;
@@ -196,27 +187,30 @@
}
try {
- final Pattern stringPattern = Pattern.compile(STRING_PATTERN,
- Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
- final Pattern urlPattern = Pattern.compile(URI_PATTERN,
- Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
-
- final Matcher quoted = stringPattern.matcher(plainText);
+ final PatternCompiler cp = new Perl5Compiler();
+ final Pattern pattern = cp.compile(STRING_PATTERN,
+ Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+ | Perl5Compiler.MULTILINE_MASK);
+ final Pattern pattern1 = cp.compile(URI_PATTERN,
+ Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+ | Perl5Compiler.MULTILINE_MASK);
+ final PatternMatcher matcher = new Perl5Matcher();
+
+ final PatternMatcher matcher1 = new Perl5Matcher();
+ final PatternMatcherInput input = new PatternMatcherInput(plainText);
+ MatchResult result;
String url;
//loop the matches
- while (quoted.find()) {
- String quotedString = quoted.group(2);
- Matcher urls = urlPattern.matcher(quotedString);
-
- if (!urls.find()) {
+ while (matcher.contains(input, pattern)) {
+ result = matcher.getMatch();
+ url = result.group(2);
+ PatternMatcherInput input1 = new PatternMatcherInput(url);
+ if (!matcher1.matches(input1, pattern1)) {
//if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); }
continue;
}
-
- url = urls.group();
-
if (url.startsWith("www.")) {
url = "http://" + url;
} else {
Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Sun Mar 11 14:18:23 2007
@@ -1,4 +1,4 @@
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -14,62 +14,60 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.nutch.net.urlnormalizer.basic;
import java.net.URL;
import java.net.MalformedURLException;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+// Nutch imports
import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.util.LogUtil;
import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.*;
-/**
- * Converts URLs to a normal form.
- * <p>
- * All substitutions will be done step by step, to ensure that certain
- * constellations will be normalized, too.
- * </p>
- * <p>
- * For example: "/aa/bb/../../cc/../foo.html will be normalized in the following
- * manner: "/aa/bb/../../cc/../foo.html" "/aa/../cc/../foo.html"
- * "/cc/../foo.html" "/foo.html".
- * </p>
- * <p>
- * The normalization also takes care of leading "/../", which will be replaced
- * by "/", because this is a rather a sign of bad webserver configuration than
- * of a wanted link. For example, urls like "http://www.foo.com/../" should
- * return a http 404 error instead of redirecting to "http://www.foo.com".
- * </p>
- */
+/** Converts URLs to a normal form . */
public class BasicURLNormalizer implements URLNormalizer {
public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class);
- /**
- * This pattern tries to find spots like "/xx/../" in the url, which could
- * be replaced by "/" xx consists of chars, different then "/" (slash) and
- * needs to have at least one char different from ".".
- */
- private static final Pattern RELATIVE_PATH_PATTERN = Pattern.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)");
-
- private static final String RELATIVE_PATH_SUBSTITUTION="/";
-
- /**
- * This pattern tries to find spots like leading "/../" in the url, which
- * could be replaced by "/".
- */
- private static final Pattern LEADING_RELATIVE_PATH_PATTERN = Pattern.compile("^(/\\.\\./)+");
-
- private static final String LEADING_RELATIVE_PATH_SUBSTITUTION="/";
+ private Perl5Compiler compiler = new Perl5Compiler();
+ private ThreadLocal matchers = new ThreadLocal() {
+ protected synchronized Object initialValue() {
+ return new Perl5Matcher();
+ }
+ };
+ private Rule relativePathRule = null;
+ private Rule leadingRelativePathRule = null;
private Configuration conf;
-
public BasicURLNormalizer() {
+ try {
+ // this pattern tries to find spots like "/xx/../" in the url, which
+ // could be replaced by "/"; xx consists of chars other than "/"
+ // (slash) and needs to have at least one char different from "."
+ relativePathRule = new Rule();
+ relativePathRule.pattern = (Perl5Pattern)
+ compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
+ Perl5Compiler.READ_ONLY_MASK);
+ relativePathRule.substitution = new Perl5Substitution("/");
+
+ // this pattern tries to find spots like leading "/../" in the url,
+ // which could be replaced by "/"
+ leadingRelativePathRule = new Rule();
+ leadingRelativePathRule.pattern = (Perl5Pattern)
+ compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
+ leadingRelativePathRule.substitution = new Perl5Substitution("/");
+
+ } catch (MalformedPatternException e) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ throw new RuntimeException(e);
+ }
}
public String normalize(String urlString, String scope)
@@ -131,25 +129,56 @@
return urlString;
}
- private String substituteUnnecessaryRelativePaths(String file) {
- String fileWorkCopy = file;
- int oldLen = file.length();
- int newLen = oldLen - 1;
- Matcher m;
-
- while (oldLen != newLen) {
- oldLen = fileWorkCopy.length();
- m = RELATIVE_PATH_PATTERN.matcher(fileWorkCopy);
- // substitue first occurence of "/xx/../" by "/"
- fileWorkCopy = m.replaceFirst(RELATIVE_PATH_SUBSTITUTION);
- m = LEADING_RELATIVE_PATH_PATTERN.matcher(fileWorkCopy);
- // remove leading "/../"
- fileWorkCopy = m.replaceFirst(LEADING_RELATIVE_PATH_SUBSTITUTION);
- newLen = fileWorkCopy.length();
+ private String substituteUnnecessaryRelativePaths(String file) {
+ String fileWorkCopy = file;
+ int oldLen = file.length();
+ int newLen = oldLen - 1;
+
+ // All substitutions will be done step by step, to ensure that certain
+ // constellations will be normalized, too
+ //
+ // For example: "/aa/bb/../../cc/../foo.html will be normalized in the
+ // following manner:
+ // "/aa/bb/../../cc/../foo.html"
+ // "/aa/../cc/../foo.html"
+ // "/cc/../foo.html"
+ // "/foo.html"
+ //
+ // The normalization also takes care of leading "/../", which will be
+ // replaced by "/", because this is rather a sign of bad webserver
+ // configuration than of a wanted link. For example, urls like
+ // "http://www.foo.com/../" should return a http 404 error instead of
+ // redirecting to "http://www.foo.com".
+ //
+ Perl5Matcher matcher = (Perl5Matcher)matchers.get();
+
+ while (oldLen != newLen) {
+ // substitute first occurrence of "/xx/../" by "/"
+ oldLen = fileWorkCopy.length();
+ fileWorkCopy = Util.substitute
+ (matcher, relativePathRule.pattern,
+ relativePathRule.substitution, fileWorkCopy, 1);
+
+ // remove leading "/../"
+ fileWorkCopy = Util.substitute
+ (matcher, leadingRelativePathRule.pattern,
+ leadingRelativePathRule.substitution, fileWorkCopy, 1);
+ newLen = fileWorkCopy.length();
+ }
+
+ return fileWorkCopy;
+ }
+
+
+ /**
+ * Class which holds a compiled pattern and its corresponding substitution
+ * string.
+ */
+ private static class Rule {
+ public Perl5Pattern pattern;
+ public Perl5Substitution substitution;
}
- return fileWorkCopy;
- }
public void setConf(Configuration conf) {
this.conf = conf;
@@ -160,3 +189,4 @@
}
}
+
Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=517015&r1=517014&r2=517015
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Sun Mar 11 14:18:23 2007
@@ -1,4 +1,4 @@
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -14,6 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.nutch.net.urlnormalizer.regex;
import java.net.URL;
@@ -27,7 +28,6 @@
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
-import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -40,6 +40,7 @@
import javax.xml.parsers.*;
import org.w3c.dom.*;
+import org.apache.oro.text.regex.*;
/**
* Allows users to do regex substitutions on all/any URLs that are encountered,
@@ -64,14 +65,16 @@
* string.
*/
private static class Rule {
- public Pattern pattern;
+ public Perl5Pattern pattern;
public String substitution;
}
- private HashMap<String, List<Rule>> scopedRules;
+ private HashMap scopedRules;
- private static final List<Rule> EMPTY_RULES = Collections.EMPTY_LIST;
+ private static final List EMPTY_RULES = Collections.EMPTY_LIST;
+
+ private PatternMatcher matcher = new Perl5Matcher();
/**
* The default constructor which is called from UrlNormalizerFactory
@@ -90,9 +93,9 @@
* configuration files for it.
*/
public RegexURLNormalizer(Configuration conf, String filename)
- throws IOException {
+ throws IOException, MalformedPatternException {
super(conf);
- List<Rule> rules = readConfigurationFile(filename);
+ List rules = readConfigurationFile(filename);
if (rules != null)
scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
}
@@ -103,9 +106,9 @@
// the default constructor was called
if (this.scopedRules == null) {
String filename = getConf().get("urlnormalizer.regex.file");
- scopedRules = new HashMap<String, List<Rule>>();
+ scopedRules = new HashMap();
URL url = getConf().getResource(filename);
- List<Rule> rules = null;
+ List rules = null;
if (url == null) {
LOG.warn("Can't load the default config file! " + filename);
rules = EMPTY_RULES;
@@ -123,7 +126,7 @@
// used in JUnit test.
void setConfiguration(InputStream is, String scope) {
- List<Rule> rules = readConfiguration(is);
+ List rules = readConfiguration(is);
scopedRules.put(scope, rules);
LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules.");
}
@@ -133,7 +136,7 @@
* patterns. It accepts a string url as input and returns the altered string.
*/
public synchronized String regexNormalize(String urlString, String scope) {
- List<Rule> curRules = scopedRules.get(scope);
+ List curRules = (List)scopedRules.get(scope);
if (curRules == null) {
// try to populate
String configFile = getConf().get("urlnormalizer.regex.file." + scope);
@@ -144,6 +147,7 @@
LOG.warn("Can't load resource for config file: " + configFile);
} else {
try {
+ InputStream is = resource.openStream();
curRules = readConfiguration(resource.openStream());
scopedRules.put(scope, curRules);
} catch (Exception e) {
@@ -158,11 +162,14 @@
}
if (curRules == EMPTY_RULES || curRules == null) {
// use global rules
- curRules = scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
+ curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT);
}
-
- for (Rule rule: curRules) {
- urlString = rule.pattern.matcher(urlString).replaceAll(rule.substitution);
+ Iterator i = curRules.iterator();
+ while (i.hasNext()) {
+ Rule r = (Rule) i.next();
+ urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution(
+ r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual
+ // substitution
}
return urlString;
}
@@ -173,7 +180,7 @@
}
/** Reads the configuration file and populates a List of Rules. */
- private List<Rule> readConfigurationFile(String filename) {
+ private List readConfigurationFile(String filename) {
if (LOG.isInfoEnabled()) {
LOG.info("loading " + filename);
}
@@ -186,8 +193,9 @@
}
}
- private List<Rule> readConfiguration(InputStream is) {
- List<Rule> rules = new ArrayList<Rule>();
+ private List readConfiguration(InputStream is) {
+ Perl5Compiler compiler = new Perl5Compiler();
+ List rules = new ArrayList();
try {
// borrowed heavily from code in Configuration.java
@@ -225,7 +233,7 @@
}
if (patternValue != null && subValue != null) {
Rule rule = new Rule();
- rule.pattern = Pattern.compile(patternValue);
+ rule.pattern = (Perl5Pattern) compiler.compile(patternValue);
rule.substitution = subValue;
rules.add(rule);
}
@@ -241,14 +249,15 @@
}
/** Spits out patterns and substitutions that are in the configuration file. */
- public static void main(String args[]) throws IOException {
+ public static void main(String args[]) throws MalformedPatternException,
+ IOException {
RegexURLNormalizer normalizer = new RegexURLNormalizer();
normalizer.setConf(NutchConfiguration.create());
Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator();
System.out.println("* Rules for 'DEFAULT' scope:");
while (i.hasNext()) {
Rule r = (Rule) i.next();
- System.out.print(" " + r.pattern.pattern() + " -> ");
+ System.out.print(" " + r.pattern.getPattern() + " -> ");
System.out.println(r.substitution);
}
// load the scope
@@ -264,7 +273,7 @@
i = ((List)normalizer.scopedRules.get(scope)).iterator();
while (i.hasNext()) {
Rule r = (Rule) i.next();
- System.out.print(" " + r.pattern.pattern() + " -> ");
+ System.out.print(" " + r.pattern.getPattern() + " -> ");
System.out.println(r.substitution);
}
}