You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/03/21 23:29:21 UTC

svn commit: r387651 - in /lucene/nutch/trunk/src/plugin/urlfilter-automaton: ./ lib/ sample/ src/ src/java/ src/java/org/ src/java/org/apache/ src/java/org/apache/nutch/ src/java/org/apache/nutch/net/ src/test/ src/test/org/ src/test/org/apache/ src/te...

Author: jerome
Date: Tue Mar 21 14:29:18 2006
New Revision: 387651

URL: http://svn.apache.org/viewcvs?rev=387651&view=rev
Log:
Add an urlfilter based on dk.brics.automaton.

Added:
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.rules
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/AutomatonURLFilter.java   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/package.html   (with props)
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/
    lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/TestAutomatonURLFilter.java   (with props)

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml Tue Mar 21 14:29:18 2006
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<project name="urlfilter-automaton" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies: the lib-regex-filter jar and its compiled test classes -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath: lib-regex-filter jars plus its test output directory -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+  </path>
+
+  <!-- Deploy Unit test dependencies (the lib-regex-filter plugin) -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- For the JUnit tests: copy the sample .rules and .urls files into ${build.test}/data (declared outside any target) -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+  </copy>
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar?rev=387651&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/lib/automaton.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml Tue Mar 21 14:29:18 2006
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Plugin descriptor: registers AutomatonURLFilter as an org.apache.nutch.net.URLFilter extension. -->
+<plugin
+   id="urlfilter-automaton"
+   name="Automaton URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-automaton.jar">
+         <export name="*"/>
+      </library>
+      <!-- dk.brics.automaton classes imported by AutomatonURLFilter (shipped as lib/automaton.jar) -->
+      <library name="automaton.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <!-- This plugin is compiled against lib-regex-filter (see build.xml), so require it at runtime too -->
+      <import plugin="lib-regex-filter"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter"
+              name="Nutch Automaton URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="AutomatonURLFilter"
+                      class="org.apache.nutch.net.AutomatonURLFilter"/>
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.rules
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.rules?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.rules (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.rules Tue Mar 21 14:29:18 2006
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Sample rule set for benchmarking (see the "Benchmarks" runs in TestAutomatonURLFilter).
+# It contains no MY.DOMAIN.NAME placeholder, so nothing needs to be customized.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# skip .fr .org and .net domains
+-.*//.*\.fr/.*
+-.*//.*\.org/.*
+-.*//.*\.net/.*
+
+# accept everything else
++.*

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.urls?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.urls (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/Benchmarks.urls Tue Mar 21 14:29:18 2006
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.rules Tue Mar 21 14:29:18 2006
@@ -0,0 +1,24 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# accept hosts in MY.DOMAIN.NAME
++http://([a-z0-9]*\.)*MY.DOMAIN.NAME/.*
+
+# skip everything else
+-.*

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/IntranetCrawling.urls Tue Mar 21 14:29:18 2006
@@ -0,0 +1,8 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
+-news://any.news.server/comp.lang.java
+-whois:/nutch.org
++http://MY.DOMAIN.NAME/
++http://MY.DOMAIN.NAME/nutch
++http://www.MY.DOMAIN.NAME/

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.rules Tue Mar 21 14:29:18 2006
@@ -0,0 +1,19 @@
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file: ftp: and mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# accept anything else
++.*

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/sample/WholeWebCrawling.urls Tue Mar 21 14:29:18 2006
@@ -0,0 +1,11 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
++news://any.news.server/comp.lang.java
++whois:/nutch.org
+-http://www.nutch.org/nutch.gif
+-http://www.nutch.org/nutch.eps
+-http://www.nutch.org/nutch?q=nutch
++http://www.nutch.org/
++http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
++http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/AutomatonURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/AutomatonURLFilter.java?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/AutomatonURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/AutomatonURLFilter.java Tue Mar 21 14:29:18 2006
@@ -0,0 +1,94 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Automaton imports
+import dk.brics.automaton.RegExp;
+import dk.brics.automaton.RunAutomaton;
+
+
+/**
+ * RegexURLFilterBase implementation based on the
+ * <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ * Finite-State Automata for Java<sup>TM</sup>.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ */
+public class AutomatonURLFilter extends RegexURLFilterBase {
+  /** Creates a filter through the no-argument base-class constructor. */
+  public AutomatonURLFilter() {
+    super();
+  }
+  /** Passes <code>filename</code> to the base class; declares the same exceptions as the Reader form. */
+  public AutomatonURLFilter(String filename)
+    throws IOException, IllegalArgumentException {
+    super(filename);
+  }
+  /** Package-private: passes a rules Reader to the base class (used by TestAutomatonURLFilter). */
+  AutomatonURLFilter(Reader reader)
+    throws IOException, IllegalArgumentException {
+    super(reader);
+  }
+
+  
+  /* ----------------------------------- *
+   * <implementation:RegexURLFilterBase> *
+   * ----------------------------------- */
+  
+  // Returns the value of the "urlfilter.automaton.file" configuration property.
+  protected String getRulesFile(Configuration conf) {
+    return conf.get("urlfilter.automaton.file");
+  }
+
+  // Wraps the sign and regular expression in an automaton-backed Rule.
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+  
+  /* ------------------------------------ *
+   * </implementation:RegexURLFilterBase> *
+   * ------------------------------------ */
+
+  /** Command-line entry point: hands a new filter and the arguments to the two-argument main. */
+  public static void main(String[] args) throws IOException {
+    main(new AutomatonURLFilter(), args);
+  }
+
+  /** Rule whose regular expression is compiled into a dk.brics RunAutomaton at construction. */
+  private class Rule extends RegexRule {
+    // Assigned once in the constructor, hence final.
+    private final RunAutomaton automaton;
+    
+    Rule(boolean sign, String regex) {
+      super(sign, regex);
+      automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+    }
+    // Returns whatever RunAutomaton.run reports for the url, i.e. whether the automaton accepts it.
+    protected boolean match(String url) {
+      return automaton.run(url);
+    }
+  }
+  
+}

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/AutomatonURLFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/package.html?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/package.html (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/package.html Tue Mar 21 14:29:18 2006
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>
+A url filter plugin based on
+<a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+Automata for Java<sup>TM</sup>.
+</p>
+</body>
+</html>

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/net/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/TestAutomatonURLFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/TestAutomatonURLFilter.java?rev=387651&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/TestAutomatonURLFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/TestAutomatonURLFilter.java Tue Mar 21 14:29:18 2006
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+// JUnit imports
+import junit.framework.Test;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+
+/**
+ * JUnit based test of class <code>AutomatonURLFilter</code>.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {
+  
+  public TestAutomatonURLFilter(String testName) {
+    super(testName);
+  }
+  
+  public static Test suite() {
+    return new TestSuite(TestAutomatonURLFilter.class);
+  }
+  
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+
+  protected URLFilter getURLFilter(Reader rules) {
+    try {
+      return new AutomatonURLFilter(rules);
+    } catch (IOException e) {
+      fail(e.toString());
+      return null;
+    }
+  }
+  
+  public void test() {
+    test("WholeWebCrawling");
+    test("IntranetCrawling");
+    bench(50, "Benchmarks");
+    bench(100, "Benchmarks");
+    bench(200, "Benchmarks");
+    bench(400, "Benchmarks");
+    bench(800, "Benchmarks");
+  }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/net/TestAutomatonURLFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native