You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2002/09/15 02:38:14 UTC
cvs commit: jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher FetcherMain.java
otis 2002/09/14 17:38:14
Modified: contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher
FetcherMain.java
Log:
- Made constructor private, added a few FIXMEs, etc.
Revision Changes Path
1.6 +30 -36 jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java
Index: FetcherMain.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/FetcherMain.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- FetcherMain.java 18 Jun 2002 00:45:10 -0000 1.5
+++ FetcherMain.java 15 Sep 2002 00:38:14 -0000 1.6
@@ -56,16 +56,16 @@
import de.lanlab.larm.threads.ThreadPoolObserver;
import de.lanlab.larm.threads.ThreadPool;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.*;
import de.lanlab.larm.gui.*;
import de.lanlab.larm.util.*;
import de.lanlab.larm.storage.*;
import de.lanlab.larm.net.*;
-import javax.swing.UIManager;
import HTTPClient.*;
import org.apache.oro.text.regex.MalformedPatternException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.*;
+import javax.swing.UIManager;
/**
@@ -110,11 +110,6 @@
protected RobotExclusionFilter reFilter;
/**
- * the host manager keeps track of all hosts and is used by the filters.
- */
- protected HostManager hostManager;
-
- /**
* this rather flaky filter just filters out some URLs, e.g. different views
* of the Apache DirIndex module. Has to be made
* configurable in the near future
@@ -122,18 +117,27 @@
protected KnownPathsFilter knownPathsFilter;
/**
+ * the URL length filter filters URLs that are too long, i.e. because of errors
+ * in the implementation of dynamic web sites
+ */
+ protected URLLengthFilter urlLengthFilter;
+
+ /**
+ * the host manager keeps track of all hosts and is used by the filters.
+ */
+ protected HostManager hostManager;
+
+ /**
* this is the main document fetcher. It contains a thread pool that fetches the
* documents and stores them
*/
protected Fetcher fetcher;
-
/**
* the thread monitor once was only a monitoring tool, but now has become a
* vital part of the system that computes statistics and
* flushes the log file buffers
*/
-
protected ThreadMonitor monitor;
/**
@@ -143,24 +147,18 @@
protected DocumentStorage storage;
/**
- * the URL length filter filters URLs that are too long, i.e. because of errors
- * in the implementation of dynamic web sites
- */
- protected URLLengthFilter urlLengthFilter;
-
- /**
* initializes all classes and registers anonymous adapter classes as
* listeners for fetcher events.
*
* @param nrThreads number of fetcher threads to be created
*/
- public FetcherMain(int nrThreads)
+ private FetcherMain(int nrThreads)
{
// to make things clear, this method is commented a bit better than
// the rest of the program...
// this is the main message queue. handlers are registered with
- // the queue, and whenever a message is put in it, they are passed to the
+ // the queue, and whenever a message is put in it, the message is passed to the
// filters in a "chain of responsibility" manner. Every listener can decide
// to throw the message away
messageHandler = new MessageHandler();
@@ -169,7 +167,6 @@
// matter how it does it, whether it's in a file, in a database or
// whatever
-
// example for the (very slow) SQL Server storage:
// this.storage = new SQLServerStorage("sun.jdbc.odbc.JdbcOdbcDriver","jdbc:odbc:search","sa","...",nrThreads);
@@ -190,6 +187,7 @@
LuceneStorage luceneStorage = new LuceneStorage();
luceneStorage.setAnalyzer(new org.apache.lucene.analysis.de.GermanAnalyzer());
luceneStorage.setCreate(true);
+ // FIXME: index name and path need to be configurable
luceneStorage.setIndexName("luceneIndex");
luceneStorage.setFieldInfo("url", LuceneStorage.INDEX | LuceneStorage.STORE);
luceneStorage.setFieldInfo("content", LuceneStorage.INDEX | LuceneStorage.STORE | LuceneStorage.TOKEN);
@@ -202,30 +200,24 @@
// heat, which evaporates above the processor
// NullStorage();
+ hostManager = new HostManager(1000);
+
// create the filters and add them to the message queue
+ reFilter = new RobotExclusionFilter(hostManager);
urlScopeFilter = new URLScopeFilter();
-
urlVisitedFilter = new URLVisitedFilter(100000);
+ knownPathsFilter = new KnownPathsFilter();
+ urlLengthFilter = new URLLengthFilter(255);
// dnsResolver = new DNSResolver();
- hostManager = new HostManager(1000);
-
- reFilter = new RobotExclusionFilter(hostManager);
-
fetcher = new Fetcher(nrThreads, storage, storage, hostManager);
- knownPathsFilter = new KnownPathsFilter();
-
- urlLengthFilter = new URLLengthFilter(255);
-
// prevent message box popups
HTTPConnection.setDefaultAllowUserInteraction(false);
// prevent GZipped files from being decoded
HTTPConnection.removeDefaultModule(HTTPClient.ContentEncodingModule.class);
-
-
// initialize the threads
fetcher.init();
@@ -266,9 +258,9 @@
/**
- * Sets the RexString attribute of the FetcherMain object
+ * Sets the RexString attribute of <code>URLScopeFilter</code>.
*
- * @param restrictTo The new RexString value
+ * @param restrictTo the new RexString value
*/
public void setRexString(String restrictTo) throws MalformedPatternException
{
@@ -292,6 +284,7 @@
}
catch (Exception e)
{
+ // FIXME: replace with logging
System.out.println("Exception: " + e.getMessage());
e.printStackTrace();
}
@@ -344,7 +337,7 @@
/**
- * The main program. parsed
+ * The main program.
*
* @param args The command line arguments
*/
@@ -357,6 +350,8 @@
boolean gui = false;
boolean showInfo = false;
System.out.println("LARM - LANLab Retrieval Machine - Fetcher - V 1.00 - (C) LANLab 2000-02");
+
+ // FIXME: consider using Jakarta Commons' CLI package for command line parameters
for (int i = 0; i < args.length; i++)
{
if (args[i].equals("-start"))
@@ -419,7 +414,6 @@
catch (MalformedURLException e)
{
System.out.println("Malformed URL");
-
}
}
}
--
To unsubscribe, e-mail: <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>