You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/18 22:43:52 UTC

svn commit: r219563 - in /lucene/nutch/branches/mapred/conf: crawl-urlfilter.txt.template regex-urlfilter.txt.template

Author: cutting
Date: Mon Jul 18 13:42:37 2005
New Revision: 219563

URL: http://svn.apache.org/viewcvs?rev=219563&view=rev
Log:
Skip URLs with repeating segments.

Modified:
    lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template
    lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template

Modified: lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template?rev=219563&r1=219562&r2=219563&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template (original)
+++ lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template Mon Jul 18 13:42:37 2005
@@ -17,6 +17,9 @@
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
 
+# skip URLs with a slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
 # accept hosts in MY.DOMAIN.NAME
 +^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
 

Modified: lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template?rev=219563&r1=219562&r2=219563&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template (original)
+++ lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template Mon Jul 18 13:42:37 2005
@@ -15,5 +15,8 @@
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
 
+# skip URLs with a slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
 # accept anything else
 +.