You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2010/10/21 13:43:37 UTC
svn commit: r1025960 - in /nutch/trunk: ./
src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/
src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/
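src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/
src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/
src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/
src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/
src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/
src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/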
Author: ab
Date: Thu Oct 21 11:43:37 2010
New Revision: 1025960
URL: http://svn.apache.org/viewvc?rev=1025960&view=rev
Log:
NUTCH-921 Reduce dependency of Nutch on config files.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Oct 21 11:43:37 2010
@@ -72,6 +72,8 @@ Release 2.0 - Current Development
* NUTCH-832 Website menu has lots of broken links - in particular the API docs (Alex McLintock via mattmann)
+* NUTCH-921 Reduce dependency of Nutch on config files (ab)
+
Release 1.1 - 2010-06-06
Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original)
+++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Thu Oct 21 11:43:37 2010
@@ -17,11 +17,13 @@
package org.apache.nutch.urlfilter.api;
// JDK imports
+import java.io.File;
import java.io.Reader;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
+import java.io.StringReader;
import java.util.List;
import java.util.ArrayList;
@@ -74,10 +76,21 @@ public abstract class RegexURLFilterBase
* Constructs a new RegexURLFilter and init it with a file of rules.
* @param filename is the name of rules file.
*/
- public RegexURLFilterBase(String filename)
+ public RegexURLFilterBase(File filename)
throws IOException, IllegalArgumentException {
this(new FileReader(filename));
}
+
+ /**
+ * Constructs a new RegexURLFilter and inits it with a list of rules.
+ * @param rules string with a list of rules, one rule per line
+ * @throws IOException
+ * @throws IllegalArgumentException
+ */
+ public RegexURLFilterBase(String rules) throws IOException,
+ IllegalArgumentException {
+ this(new StringReader(rules));
+ }
/**
* Constructs a new RegexURLFilter and init it with a Reader of rules.
@@ -85,7 +98,7 @@ public abstract class RegexURLFilterBase
*/
protected RegexURLFilterBase(Reader reader)
throws IOException, IllegalArgumentException {
- rules = readRulesFile(reader);
+ rules = readRules(reader);
}
/**
@@ -102,9 +115,9 @@ public abstract class RegexURLFilterBase
* Returns the name of the file of rules to use for
* a particular implementation.
* @param conf is the current configuration.
- * @return the name of the file of rules to use.
+ * @return the name of the resource containing the rules to use.
*/
- protected abstract String getRulesFile(Configuration conf);
+ protected abstract Reader getRulesReader(Configuration conf) throws IOException;
/* -------------------------- *
@@ -132,18 +145,18 @@ public abstract class RegexURLFilterBase
public void setConf(Configuration conf) {
this.conf = conf;
- String file = getRulesFile(conf);
- Reader reader = conf.getConfResourceAsReader(file);
- if (reader == null) {
- if (LOG.isErrorEnabled()) { LOG.error("Can't find resource: " + file); }
- } else {
- try {
- rules = readRulesFile(reader);
- } catch (IOException e) {
- if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
- //TODO mb@media-style.com: throw Exception? Because broken api.
- throw new RuntimeException(e.getMessage(), e);
- }
+ Reader reader = null;
+ try {
+ reader = getRulesReader(conf);
+ } catch (Exception e) {
+ if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+ throw new RuntimeException(e.getMessage(), e);
+ }
+ try {
+ rules = readRules(reader);
+ } catch (IOException e) {
+ if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+ throw new RuntimeException(e.getMessage(), e);
}
}
@@ -161,7 +174,7 @@ public abstract class RegexURLFilterBase
* @param reader is a reader of regular expressions rules.
* @return the corresponding {@RegexRule rules}.
*/
- private RegexRule[] readRulesFile(Reader reader)
+ private RegexRule[] readRules(Reader reader)
throws IOException, IllegalArgumentException {
BufferedReader in = new BufferedReader(reader);
Modified: nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java Thu Oct 21 11:43:37 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.urlfilter.autom
// JDK imports
import java.io.Reader;
import java.io.IOException;
+import java.io.StringReader;
import java.util.regex.PatternSyntaxException;
// Hadoop imports
@@ -41,6 +42,8 @@ import org.apache.nutch.urlfilter.api.Re
* @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
*/
public class AutomatonURLFilter extends RegexURLFilterBase {
+ public static final String URLFILTER_AUTOMATON_FILE = "urlfilter.automaton.file";
+ public static final String URLFILTER_AUTOMATON_RULES = "urlfilter.automaton.rules";
public AutomatonURLFilter() {
super();
@@ -61,9 +64,17 @@ public class AutomatonURLFilter extends
* <implementation:RegexURLFilterBase> *
* ----------------------------------- */
- // Inherited Javadoc
- protected String getRulesFile(Configuration conf) {
- return conf.get("urlfilter.automaton.file");
+ /**
+ * Rules specified as a config property will override rules specified
+ * as a config file.
+ */
+ protected Reader getRulesReader(Configuration conf) throws IOException {
+ String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
+ if (stringRules != null) {
+ return new StringReader(stringRules);
+ }
+ String fileRules = conf.get(URLFILTER_AUTOMATON_FILE);
+ return conf.getConfResourceAsReader(fileRules);
}
// Inherited Javadoc
Modified: nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Thu Oct 21 11:43:37 2010
@@ -20,6 +20,7 @@ import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
import java.util.LinkedHashSet;
import java.util.Set;
@@ -69,7 +70,7 @@ public class DomainURLFilter
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();
- private void readConfigurationFile(Reader configReader)
+ private void readConfiguration(Reader configReader)
throws IOException {
// read the configuration file, line by line
@@ -139,21 +140,24 @@ public class DomainURLFilter
// domain file and attribute "file" take precedence if defined
String file = conf.get("urlfilter.domain.file");
+ String stringRules = conf.get("urlfilter.domain.rules");
if (domainFile != null) {
file = domainFile;
}
else if (attributeFile != null) {
file = attributeFile;
}
-
- // get the file as a classpath resource and populate the domain set with
- // the domains from the file
+ Reader reader = null;
+ if (stringRules != null) { // takes precedence over files
+ reader = new StringReader(stringRules);
+ } else {
+ reader = conf.getConfResourceAsReader(file);
+ }
try {
- Reader reader = conf.getConfResourceAsReader(file);
if (reader == null) {
reader = new FileReader(file);
}
- readConfigurationFile(reader);
+ readConfiguration(reader);
}
catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
Modified: nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java Thu Oct 21 11:43:37 2010
@@ -36,6 +36,7 @@ import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
+import java.io.StringReader;
import java.util.List;
import java.util.ArrayList;
@@ -63,8 +64,8 @@ public class PrefixURLFilter implements
}
- public PrefixURLFilter(String filename) throws IOException {
- trie = readConfigurationFile(new FileReader(filename));
+ public PrefixURLFilter(String stringRules) throws IOException {
+ trie = readConfiguration(new StringReader(stringRules));
}
public String filter(String url) {
@@ -74,7 +75,7 @@ public class PrefixURLFilter implements
return url;
}
- private TrieStringMatcher readConfigurationFile(Reader reader)
+ private TrieStringMatcher readConfiguration(Reader reader)
throws IOException {
BufferedReader in=new BufferedReader(reader);
@@ -144,16 +145,22 @@ public class PrefixURLFilter implements
}
String file = conf.get("urlfilter.prefix.file");
+ String stringRules = conf.get("urlfilter.prefix.rules");
// attribute "file" takes precedence if defined
if (attributeFile != null)
file = attributeFile;
- Reader reader = conf.getConfResourceAsReader(file);
+ Reader reader = null;
+ if (stringRules != null) { // takes precedence over files
+ reader = new StringReader(stringRules);
+ } else {
+ reader = conf.getConfResourceAsReader(file);
+ }
if (reader == null) {
trie = new PrefixStringMatcher(new String[0]);
} else {
try {
- trie = readConfigurationFile(reader);
+ trie = readConfiguration(reader);
} catch (IOException e) {
if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
// TODO mb@media-style.com: throw Exception? Because broken api.
Modified: nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Thu Oct 21 11:43:37 2010
@@ -19,6 +19,7 @@ package org.apache.nutch.urlfilter.regex
// JDK imports
import java.io.Reader;
import java.io.IOException;
+import java.io.StringReader;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@@ -35,6 +36,9 @@ import org.apache.nutch.util.NutchConfig
* {@link java.util.regex Java Regex implementation}.
*/
public class RegexURLFilter extends RegexURLFilterBase {
+
+ public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
+ public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
public RegexURLFilter() {
super();
@@ -55,9 +59,17 @@ public class RegexURLFilter extends Rege
* <implementation:RegexURLFilterBase> *
* ----------------------------------- */
- // Inherited Javadoc
- protected String getRulesFile(Configuration conf) {
- return conf.get("urlfilter.regex.file");
+ /**
+ * Rules specified as a config property will override rules specified
+ * as a config file.
+ */
+ protected Reader getRulesReader(Configuration conf) throws IOException {
+ String stringRules = conf.get(URLFILTER_REGEX_RULES);
+ if (stringRules != null) {
+ return new StringReader(stringRules);
+ }
+ String fileRules = conf.get(URLFILTER_REGEX_FILE);
+ return conf.getConfResourceAsReader(fileRules);
}
// Inherited Javadoc
Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Thu Oct 21 11:43:37 2010
@@ -34,6 +34,7 @@ import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
+import java.io.StringReader;
import java.util.List;
import java.util.ArrayList;
@@ -139,7 +140,7 @@ public class SuffixURLFilter implements
}
public SuffixURLFilter(Reader reader) throws IOException {
- readConfigurationFile(reader);
+ readConfiguration(reader);
}
public String filter(String url) {
@@ -167,7 +168,7 @@ public class SuffixURLFilter implements
}
}
- public void readConfigurationFile(Reader reader) throws IOException {
+ public void readConfiguration(Reader reader) throws IOException {
// handle missing config file
if (reader == null) {
@@ -269,12 +270,18 @@ public class SuffixURLFilter implements
}
String file = conf.get("urlfilter.suffix.file");
+ String stringRules = conf.get("urlfilter.suffix.rules");
// attribute "file" takes precedence if defined
if (attributeFile != null) file = attributeFile;
- Reader reader = conf.getConfResourceAsReader(file);
+ Reader reader = null;
+ if (stringRules != null) { // takes precedence over files
+ reader = new StringReader(stringRules);
+ } else {
+ reader = conf.getConfResourceAsReader(file);
+ }
try {
- readConfigurationFile(reader);
+ readConfiguration(reader);
} catch (IOException e) {
if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
throw new RuntimeException(e.getMessage(), e);
Modified: nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Thu Oct 21 11:43:37 2010
@@ -20,8 +20,11 @@ package org.apache.nutch.net.urlnormaliz
import java.net.URL;
import java.net.MalformedURLException;
import java.io.FileInputStream;
+import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
@@ -40,6 +43,7 @@ import org.apache.nutch.util.NutchConfig
import javax.xml.parsers.*;
import org.w3c.dom.*;
+import org.xml.sax.InputSource;
import org.apache.oro.text.regex.*;
/**
@@ -106,17 +110,23 @@ public class RegexURLNormalizer extends
// the default constructor was called
if (this.scopedRules == null) {
String filename = getConf().get("urlnormalizer.regex.file");
+ String stringRules = getConf().get("urlnormalizer.regex.rules");
scopedRules = new HashMap();
- URL url = getConf().getResource(filename);
+ Reader reader = null;
+ if (stringRules != null) {
+ reader = new StringReader(stringRules);
+ } else {
+ reader = getConf().getConfResourceAsReader(filename);
+ }
List rules = null;
- if (url == null) {
- LOG.warn("Can't load the default config file! " + filename);
+ if (reader == null) {
+ LOG.warn("Can't load the default rules! ");
rules = EMPTY_RULES;
} else {
try {
- rules = readConfiguration(url.openStream());
+ rules = readConfiguration(reader);
} catch (Exception e) {
- LOG.warn("Couldn't read default config from '" + url + "': " + e);
+ LOG.warn("Couldn't read default config: " + e);
rules = EMPTY_RULES;
}
}
@@ -125,8 +135,8 @@ public class RegexURLNormalizer extends
}
// used in JUnit test.
- void setConfiguration(InputStream is, String scope) {
- List rules = readConfiguration(is);
+ void setConfiguration(Reader reader, String scope) {
+ List rules = readConfiguration(reader);
scopedRules.put(scope, rules);
LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules.");
}
@@ -141,17 +151,16 @@ public class RegexURLNormalizer extends
// try to populate
String configFile = getConf().get("urlnormalizer.regex.file." + scope);
if (configFile != null) {
- URL resource = getConf().getResource(configFile);
- LOG.debug("resource for scope '" + scope + "': " + resource);
- if (resource == null) {
+ LOG.debug("resource for scope '" + scope + "': " + configFile);
+ if (configFile == null) {
LOG.warn("Can't load resource for config file: " + configFile);
} else {
try {
- InputStream is = resource.openStream();
- curRules = readConfiguration(resource.openStream());
+ Reader reader = getConf().getConfResourceAsReader(configFile);
+ curRules = readConfiguration(reader);
scopedRules.put(scope, curRules);
} catch (Exception e) {
- LOG.warn("Couldn't load resource '" + resource + "': " + e);
+ LOG.warn("Couldn't load resource '" + configFile + "': " + e);
}
}
}
@@ -185,22 +194,22 @@ public class RegexURLNormalizer extends
LOG.info("loading " + filename);
}
try {
- FileInputStream fis = new FileInputStream(filename);
- return readConfiguration(fis);
+ FileReader reader = new FileReader(filename);
+ return readConfiguration(reader);
} catch (Exception e) {
LOG.error("Error loading rules from '" + filename + "': " + e);
return EMPTY_RULES;
}
}
- private List readConfiguration(InputStream is) {
+ private List readConfiguration(Reader reader) {
Perl5Compiler compiler = new Perl5Compiler();
List rules = new ArrayList();
try {
// borrowed heavily from code in Configuration.java
Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
- .parse(is);
+ .parse(new InputSource(reader));
Element root = doc.getDocumentElement();
if ((!"regex-normalize".equals(root.getTagName()))
&& (LOG.isErrorEnabled())) {
Modified: nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java?rev=1025960&r1=1025959&r2=1025960&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (original)
+++ nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java Thu Oct 21 11:43:37 2010
@@ -21,6 +21,7 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
+import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
@@ -64,10 +65,10 @@ public class TestRegexURLNormalizer exte
});
for (int i = 0; i < configs.length; i++) {
try {
- FileInputStream fis = new FileInputStream(configs[i]);
+ FileReader reader = new FileReader(configs[i]);
String cname = configs[i].getName();
cname = cname.substring(16, cname.indexOf(".xml"));
- normalizer.setConfiguration(fis, cname);
+ normalizer.setConfiguration(reader, cname);
NormalizedURL[] urls = readTestFile(cname);
testData.put(cname, urls);
} catch (Exception e) {