You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/11/21 19:38:11 UTC

svn commit: r477806 - in /lucene/nutch/trunk: CHANGES.txt conf/crawl-urlfilter.txt.template conf/regex-urlfilter.txt.template conf/suffix-urlfilter.txt

Author: siren
Date: Tue Nov 21 10:38:10 2006
New Revision: 477806

URL: http://svn.apache.org/viewvc?view=rev&rev=477806
Log:
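NUTCH-305 - Update crawl and URL filter lists to exclude jpeg|JPEG|bmp|BMP; also update suffix-urlfilter.txt (contributed by Stefan Neufeind)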

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/crawl-urlfilter.txt.template
    lucene/nutch/trunk/conf/regex-urlfilter.txt.template
    lucene/nutch/trunk/conf/suffix-urlfilter.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=477806&r1=477805&r2=477806
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Nov 21 10:38:10 2006
@@ -85,6 +85,10 @@
 
 28. NUTCH-362 - Remove parse-text from unsupported filetypes in
     parse-plugins.xml (siren)
+    
+29. NUTCH-305 - Update crawl and URL filter lists to exclude
+    jpeg|JPEG|bmp|BMP; suffix-urlfilter.txt (contributed by Stefan
+    Neufeind) is also updated (siren)
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/conf/crawl-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/crawl-urlfilter.txt.template?view=diff&rev=477806&r1=477805&r2=477806
==============================================================================
--- lucene/nutch/trunk/conf/crawl-urlfilter.txt.template (original)
+++ lucene/nutch/trunk/conf/crawl-urlfilter.txt.template Tue Nov 21 10:38:10 2006
@@ -12,7 +12,7 @@
 -^(file|ftp|mailto):
 
 # skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]

Modified: lucene/nutch/trunk/conf/regex-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/regex-urlfilter.txt.template?view=diff&rev=477806&r1=477805&r2=477806
==============================================================================
--- lucene/nutch/trunk/conf/regex-urlfilter.txt.template (original)
+++ lucene/nutch/trunk/conf/regex-urlfilter.txt.template Tue Nov 21 10:38:10 2006
@@ -10,7 +10,7 @@
 -^(file|ftp|mailto):
 
 # skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]

Modified: lucene/nutch/trunk/conf/suffix-urlfilter.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/suffix-urlfilter.txt?view=diff&rev=477806&r1=477805&r2=477806
==============================================================================
--- lucene/nutch/trunk/conf/suffix-urlfilter.txt (original)
+++ lucene/nutch/trunk/conf/suffix-urlfilter.txt Tue Nov 21 10:38:10 2006
@@ -3,9 +3,87 @@
 # case-insensitive, allow unknown suffixes
 +I
 
-# prohibit these
+### prohibit these
+# pictures
 .gif
 .jpg
 .jpeg
 .bmp
 .png
+.tif
+.tiff
+.ico
+.eps
+.ps
+.wmf
+.fpx
+.cur
+.ani
+.img
+.lwf
+.pcd
+.psp
+.psd
+.tga
+.xbm
+.xpm
+
+# web-formats
+.css
+
+# archives/packages
+.arj
+.arc
+.7z
+.cab
+.lzw
+.lha
+.lzh
+.zip
+.gz
+.tar
+.tgz
+.sit
+.rpm
+.deb
+.pkg
+
+# audio/video
+.mid
+.midi
+.rmi
+.mpeg
+.mpg
+.mpe
+.mp3
+.mp2
+.aac
+.mov
+.fla
+.flv
+.ra
+.ram
+.rm
+.rmv
+.wma
+.wmv
+.wav
+.wave
+.ogg
+.avi
+.au
+.snd
+
+# executables
+.exe
+.com
+
+# Windows links
+.lnk
+
+# TYPO3 extensions
+.t3x
+
+# disc-images
+.iso
+.bin