You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2007/03/09 23:41:24 UTC
svn commit: r516592 - in /lucene/nutch/trunk/conf: crawl-urlfilter.txt.template regex-urlfilter.txt.template
Author: kubes
Date: Fri Mar 9 14:41:24 2007
New Revision: 516592
URL: http://svn.apache.org/viewvc?view=rev&rev=516592
Log:
NUTCH-233 resolved. Patch supplied by Stefan Groschupf. Thanks Stefan.
Modified:
lucene/nutch/trunk/conf/crawl-urlfilter.txt.template
lucene/nutch/trunk/conf/regex-urlfilter.txt.template
Modified: lucene/nutch/trunk/conf/crawl-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/crawl-urlfilter.txt.template?view=diff&rev=516592&r1=516591&r2=516592
==============================================================================
--- lucene/nutch/trunk/conf/crawl-urlfilter.txt.template (original)
+++ lucene/nutch/trunk/conf/crawl-urlfilter.txt.template Fri Mar 9 14:41:24 2007
@@ -34,7 +34,7 @@
-[?*!@=]
# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
--.*(/.+?)/.*?\1/.*?\1/
+-.*(/[^/]+)/[^/]+\1/[^/]+\1/
# accept hosts in MY.DOMAIN.NAME
+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
Modified: lucene/nutch/trunk/conf/regex-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/regex-urlfilter.txt.template?view=diff&rev=516592&r1=516591&r2=516592
==============================================================================
--- lucene/nutch/trunk/conf/regex-urlfilter.txt.template (original)
+++ lucene/nutch/trunk/conf/regex-urlfilter.txt.template Fri Mar 9 14:41:24 2007
@@ -32,7 +32,7 @@
-[?*!@=]
# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
--.*(/.+?)/.*?\1/.*?\1/
+-.*(/[^/]+)/[^/]+\1/[^/]+\1/
# accept anything else
+.