You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2002/09/15 02:38:14 UTC

cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher FetcherMain.java

otis        2002/09/14 17:38:14

  Modified:    contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
                        FetcherMain.java
  Log:
  - Made constructor private, added a few FIXMEs, etc.
  
  Revision  Changes    Path
  1.6       +30 -36    jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
  
  Index: FetcherMain.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- FetcherMain.java	18 Jun 2002 00:45:10 -0000	1.5
  +++ FetcherMain.java	15 Sep 2002 00:38:14 -0000	1.6
  @@ -56,16 +56,16 @@
   
   import de.lanlab.larm.threads.ThreadPoolObserver;
   import de.lanlab.larm.threads.ThreadPool;
  -import java.net.MalformedURLException;
  -import java.net.URL;
  -import java.util.*;
   import de.lanlab.larm.gui.*;
   import de.lanlab.larm.util.*;
   import de.lanlab.larm.storage.*;
   import de.lanlab.larm.net.*;
  -import javax.swing.UIManager;
   import HTTPClient.*;
   import org.apache.oro.text.regex.MalformedPatternException;
  +import java.net.MalformedURLException;
  +import java.net.URL;
  +import java.util.*;
  +import javax.swing.UIManager;
   
   
   /**
  @@ -110,11 +110,6 @@
       protected RobotExclusionFilter reFilter;
   
       /**
  -     * the host manager keeps track of all hosts and is used by the filters.
  -     */
  -    protected HostManager hostManager;
  -
  -    /**
        * this rather flaky filter just filters out some URLs, i.e. different views
        * of the Apache DirIndex module. Has to be made
        * configurable in the near future
  @@ -122,18 +117,27 @@
       protected KnownPathsFilter knownPathsFilter;
   
       /**
  +     * the URL length filter filters URLs that are too long, i.e. because of errors
  +     * in the implementation of dynamic web sites
  +     */
  +    protected URLLengthFilter urlLengthFilter;
  +
  +    /**
  +     * the host manager keeps track of all hosts and is used by the filters.
  +     */
  +    protected HostManager hostManager;
  +
  +    /**
        * this is the main document fetcher. It contains a thread pool that fetches the
        * documents and stores them
        */
       protected Fetcher fetcher;
   
  -
       /**
        * the thread monitor once was only a monitoring tool, but now has become a
        * vital part of the system that computes statistics and
        * flushes the log file buffers
        */
  -
       protected ThreadMonitor monitor;
   
       /**
  @@ -143,24 +147,18 @@
       protected DocumentStorage storage;
   
       /**
  -     * the URL length filter filters URLs that are too long, i.e. because of errors
  -     * in the implementation of dynamic web sites
  -     */
  -    protected URLLengthFilter urlLengthFilter;
  -
  -    /**
        * initializes all classes and registers anonymous adapter classes as
        * listeners for fetcher events.
        *
        * @param nrThreads  number of fetcher threads to be created
        */
  -    public FetcherMain(int nrThreads)
  +    private FetcherMain(int nrThreads)
       {
           // to make things clear, this method is commented a bit better than
           // the rest of the program...
   
           // this is the main message queue. handlers are registered with
  -        // the queue, and whenever a message is put in it, they are passed to the
  +        // the queue, and whenever a message is put in it, the message is passed to the
           // filters in a "chain of responsibility" manner. Every listener can decide
           // to throw the message away
           messageHandler = new MessageHandler();
  @@ -169,7 +167,6 @@
           // matter how it does it, whether it's in a file, in a database or
           // whatever
   
  -
           // example for the (very slow) SQL Server storage:
           // this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads);
   
  @@ -190,6 +187,7 @@
           LuceneStorage luceneStorage = new LuceneStorage();
           luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
           luceneStorage.setCreate(true);
  +	// FIXME: index name and path need to be configurable
           luceneStorage.setIndexName("luceneIndex");
           luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
           luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
  @@ -202,30 +200,24 @@
           // heat, which evaporates above the processor
           // NullStorage();
   
  +        hostManager = new HostManager(1000);
  +
           // create the filters and add them to the message queue
  +        reFilter = new RobotExclusionFilter(hostManager);
           urlScopeFilter = new URLScopeFilter();
  -
           urlVisitedFilter = new URLVisitedFilter(100000);
  +        knownPathsFilter = new KnownPathsFilter();
  +        urlLengthFilter = new URLLengthFilter(255);
   
           // dnsResolver = new DNSResolver();
  -        hostManager = new HostManager(1000);
  -
  -        reFilter = new RobotExclusionFilter(hostManager);
  -
           fetcher = new Fetcher(nrThreads, storage, storage, hostManager);
   
  -        knownPathsFilter = new KnownPathsFilter();
  -
  -        urlLengthFilter = new URLLengthFilter(255);
  -
           // prevent message box popups
           HTTPConnection.setDefaultAllowUserInteraction(false);
   
           // prevent GZipped files from being decoded
           HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
   
  -
  -
           // initialize the threads
           fetcher.init();
   
  @@ -266,9 +258,9 @@
   
   
       /**
  -     * Sets the RexString attribute of the FetcherMain object
  +     * Sets the RexString attribute of <code>UrlScopeFilter</code>.
        *
  -     * @param restrictTo                          The new RexString value
  +     * @param restrictTo the new RexString value
        */
       public void setRexString(String restrictTo) throws MalformedPatternException
       {
  @@ -292,6 +284,7 @@
           }
           catch (Exception e)
           {
  +	    // FIXME: replace with logging
               System.out.println("Exception: " + e.getMessage());
               e.printStackTrace();
           }
  @@ -344,7 +337,7 @@
   
   
       /**
  -     * The main program. parsed
  +     * The main program.
        *
        * @param args  The command line arguments
        */
  @@ -357,6 +350,8 @@
           boolean gui = false;
           boolean showInfo = false;
           System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
  +
  +	// FIXME: consider using Jakarta Commons' CLI package for command line parameters
           for (int i = 0; i < args.length; i++)
           {
               if (args[i].equals("-start"))
  @@ -419,7 +414,6 @@
                   catch (MalformedURLException e)
                   {
                       System.out.println("Malformed URL");
  -
                   }
               }
           }
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>