You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2006/01/09 22:55:32 UTC
svn commit: r367408 -
/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
Author: cutting
Date: Mon Jan 9 13:55:31 2006
New Revision: 367408
URL: http://svn.apache.org/viewcvs?rev=367408&view=rev
Log:
NUTCH-160: Switch RegexURLFilter to use Java regexes rather than oro, since Java's seem to be faster & more reliable. By Rod Taylor.
Modified:
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java?rev=367408&r1=367407&r2=367408&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java Mon Jan 9 13:55:31 2006
@@ -32,12 +32,7 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Logger;
-
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.MalformedPatternException;
+import java.util.regex.*;
/**
* Filters URLs based on a file of regular expressions. The file is named by
@@ -80,15 +75,14 @@
}
private static class Rule {
- public Perl5Pattern pattern;
+ public Pattern pattern;
public boolean sign;
public String regex;
}
private List rules;
- private PatternMatcher matcher = new Perl5Matcher();
- public RegexURLFilter() throws IOException, MalformedPatternException {
+ public RegexURLFilter() throws IOException, PatternSyntaxException {
String file = NutchConf.get().get("urlfilter.regex.file");
// attribute "file" takes precedence if defined
if (attributeFile != null)
@@ -103,7 +97,7 @@
}
public RegexURLFilter(String filename)
- throws IOException, MalformedPatternException {
+ throws IOException, PatternSyntaxException {
rules = readConfigurationFile(new FileReader(filename));
}
@@ -111,7 +105,9 @@
Iterator i=rules.iterator();
while(i.hasNext()) {
Rule r=(Rule) i.next();
- if (matcher.contains(url,r.pattern)) {
+ Matcher matcher = r.pattern.matcher(url);
+
+ if (matcher.find()) {
//System.out.println("Matched " + r.regex);
return r.sign ? url : null;
}
@@ -129,10 +125,9 @@
//
private static List readConfigurationFile(Reader reader)
- throws IOException, MalformedPatternException {
+ throws IOException, PatternSyntaxException {
BufferedReader in=new BufferedReader(reader);
- Perl5Compiler compiler=new Perl5Compiler();
List rules=new ArrayList();
String line;
@@ -157,7 +152,7 @@
String regex=line.substring(1);
Rule rule=new Rule();
- rule.pattern=(Perl5Pattern) compiler.compile(regex);
+ rule.pattern=Pattern.compile(regex);
rule.sign=sign;
rule.regex=regex;
rules.add(rule);
@@ -167,7 +162,7 @@
}
public static void main(String args[])
- throws IOException, MalformedPatternException {
+ throws IOException, PatternSyntaxException {
RegexURLFilter filter=new RegexURLFilter();
BufferedReader in=new BufferedReader(new InputStreamReader(System.in));