You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/01/29 10:31:43 UTC
[nutch] branch master updated: NUTCH-2689 Speed up urlfilter-regex
and urlfilter-automaton - do not extract host and domain name from the URL
if not needed - speed up regular expressions: - use non-capturing groups if
possible - use (?i) to make the patterns case insensitive and remove
uppercase variants to keep alternations shorter
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new f87b19b NUTCH-2689 Speed up urlfilter-regex and urlfilter-automaton - do not extract host and domain name from the URL if not needed - speed up regular expressions: - use non-capturing groups if possible - use (?i) to make the patterns case insensitive and remove uppercase variants to keep alternations shorter
new da8f3f5 Merge pull request #432 from sebastian-nagel/NUTCH-2689-urlfilter-regex-speed-up
f87b19b is described below
commit f87b19b0ee8a01c5f54f5ed4b6b159169705682f
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Jan 22 14:45:29 2019 +0100
NUTCH-2689 Speed up urlfilter-regex and urlfilter-automaton
- do not extract host and domain name from the URL if not needed
- speed up regular expressions:
- use non-capturing groups if possible
- use (?i) to make the patterns case insensitive and
remove uppercase variants to keep alternations shorter
---
conf/regex-urlfilter.txt.template | 4 +--
.../nutch/urlfilter/api/RegexURLFilterBase.java | 40 +++++++++++++---------
src/plugin/urlfilter-regex/sample/Benchmarks.rules | 12 +++----
.../urlfilter-regex/sample/IntranetCrawling.rules | 6 ++--
.../urlfilter-regex/sample/WholeWebCrawling.rules | 4 +--
5 files changed, 37 insertions(+), 29 deletions(-)
diff --git a/conf/regex-urlfilter.txt.template b/conf/regex-urlfilter.txt.template
index 4319bf1..1448642 100644
--- a/conf/regex-urlfilter.txt.template
+++ b/conf/regex-urlfilter.txt.template
@@ -24,14 +24,14 @@
# matches, the URL is ignored.
# skip file: ftp: and mailto: urls
--^(file|ftp|mailto):
+-^(?:file|ftp|mailto):
# skip URLs longer than 2048 characters, see also db.max.outlink.length
#-^.{2049,}
# skip image and other suffixes we can't yet parse
# for a more extensive coverage use the urlfilter-suffix plugin
--(?i)\.(gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$
+-(?i)\.(?:gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$
# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index ecbe29d..993b37d 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -69,6 +69,14 @@ public abstract class RegexURLFilterBase implements URLFilter {
private Configuration conf;
/**
+ * Whether there are host- or domain-specific rules. If there are no specific
+ * rules host and domain name are not extracted from the URL to speed up the
+ * matching. {@link #readRules(Reader)} automatically sets this to true if
+ * host- or domain-specific rules are used in the rule file.
+ */
+ protected boolean hasHostDomainRules = false;
+
+ /**
* Constructs a new empty RegexURLFilterBase
*/
public RegexURLFilterBase() {
@@ -154,34 +162,33 @@ public abstract class RegexURLFilterBase implements URLFilter {
// Inherited Javadoc
public String filter(String url) {
- String host = URLUtil.getHost(url);
+ String host = null;
String domain = null;
-
- try {
- domain = URLUtil.getDomainName(url);
- } catch (MalformedURLException e) {
- // shouldnt happen here right?
+
+ if (hasHostDomainRules) {
+ host = URLUtil.getHost(url);
+ try {
+ domain = URLUtil.getDomainName(url);
+ } catch (MalformedURLException e) {
+ // shouldnt happen here right?
+ }
+
+ LOG.debug("URL belongs to host {} and domain {}", host, domain);
}
- if (LOG.isDebugEnabled()) {
- LOG.debug("URL belongs to host " + host + " and domain " + domain);
- }
-
for (RegexRule rule : rules) {
// Skip the skip for rules that don't share the same host and domain
if (rule.hostOrDomain() != null &&
!rule.hostOrDomain().equals(host) &&
!rule.hostOrDomain().equals(domain)) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
- }
+ LOG.debug("Skipping rule [{}] for host: {}", rule.regex(),
+ rule.hostOrDomain());
continue;
}
- if (LOG.isDebugEnabled()) {
- LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
- }
+ LOG.debug("Applying rule [{}] for host {} and domain {}", rule.regex(),
+ host, domain);
if (rule.match(url)) {
return rule.accept() ? url : null;
@@ -265,6 +272,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
continue;
case '>':
hostOrDomain = line.substring(1).trim();
+ hasHostDomainRules = true;
continue;
case '<':
hostOrDomain = null;
diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.rules b/src/plugin/urlfilter-regex/sample/Benchmarks.rules
index c8901e2..6a85118 100644
--- a/src/plugin/urlfilter-regex/sample/Benchmarks.rules
+++ b/src/plugin/urlfilter-regex/sample/Benchmarks.rules
@@ -9,18 +9,18 @@
# matches, the URL is ignored.
# skip file:, ftp:, & mailto: urls
--^(file|ftp|mailto):
+-^(?:file|ftp|mailto):
# skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$
# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]
# skip .fr .org and .net domains
--^.*//.*\.fr/
--^.*//.*\.org/
--^.*//.*\.net/
+-^[^/]*//[^/]*\.fr/
+-^[^/]*//[^/]*\.org/
+-^[^/]*//[^/]*\.net/
-# skip everything else
+# accept everything else
+.
diff --git a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
index 705bdb2..e651dd5 100644
--- a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
+++ b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
@@ -9,10 +9,10 @@
# matches, the URL is ignored.
# skip file:, ftp:, & mailto: urls
--^(file|ftp|mailto):
+-^(?:file|ftp|mailto):
# skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$
# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]
@@ -21,7 +21,7 @@
-.*(/.+?)/.*?\1/.*?\1/
# accept hosts in MY.DOMAIN.NAME
-+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
++^https?://(?:[a-z0-9]*\.)*MY.DOMAIN.NAME/
# skip everything else
-.
diff --git a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
index 8778921..ac9ad60 100644
--- a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
+++ b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
@@ -7,10 +7,10 @@
# matches, the URL is ignored.
# skip file: ftp: and mailto: urls
--^(file|ftp|mailto):
+-^(?:file|ftp|mailto):
# skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
+-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe)$
# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]