Posted to commits@lenya.apache.org by gr...@apache.org on 2004/12/27 15:59:01 UTC
svn commit: r123417 - /lenya/trunk/src/java/org/apache/lenya/search/crawler/CrawlerConfiguration.java /lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
Author: gregor
Date: Mon Dec 27 06:59:00 2004
New Revision: 123417
URL: http://svn.apache.org/viewcvs?view=rev&rev=123417
Log:
Refactored IterativeHTMLCrawler to implement Avalon's Configurable interface, replacing the removed CrawlerConfiguration class.
Removed:
lenya/trunk/src/java/org/apache/lenya/search/crawler/CrawlerConfiguration.java
Modified:
lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
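Note: "Configurable" here is the Avalon framework lifecycle interface. For reference, it is a single-method contract (this sketch mirrors the interface in org.apache.avalon.framework.configuration; a component is handed its configuration once, before use):

    public interface Configurable {
        void configure(Configuration configuration) throws ConfigurationException;
    }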
Deleted: /lenya/trunk/src/java/org/apache/lenya/search/crawler/CrawlerConfiguration.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/lenya/search/crawler/CrawlerConfiguration.java?view=auto&rev=123416
==============================================================================
Modified: lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java?view=diff&rev=123417&p1=lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java&r1=123416&p2=lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java&r2=123417
==============================================================================
--- lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java (original)
+++ lenya/trunk/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java Mon Dec 27 06:59:00 2004
@@ -28,29 +28,37 @@
import websphinx.RobotExclusion;
+import org.apache.avalon.framework.configuration.Configurable;
+import org.apache.avalon.framework.configuration.Configuration;
+import org.apache.avalon.framework.configuration.DefaultConfigurationBuilder;
import org.apache.log4j.Category;
/**
* Crawl iteratively
*/
-public class IterativeHTMLCrawler {
+public class IterativeHTMLCrawler implements Configurable {
static Category log = Category.getInstance(IterativeHTMLCrawler.class);
java.util.Vector urlsToCrawl;
java.util.TreeSet urlsToCrawlLowerCase;
- String url_list_file = "url_file.txt";
- String html_dump_directory = "html_dump";
+ String uriList = "url_file.txt";
+ String htdocsDumpDir = "html_dump";
+ private String baseURL;
private String rootURL;
private String[] scopeURL;
+ private String userAgent;
private RobotExclusion robot;
+ private String robotsFile;
+ private String robotsDomain;
+ private String configurationFilePath;
/**
* Command line interface
*
* @param args Configuration file crawler.xconf
*/
- public static void main(String[] args) {
+ public void main(String[] args) {
if (args.length == 0) {
System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
@@ -59,15 +67,17 @@
try {
if (args.length == 1) {
- CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
- new IterativeHTMLCrawler(new File(args[0])).crawl(new URL(ce.getBaseURL()), ce.getScopeURL());
+ configurationFilePath = args[0];
+ try {
+ DefaultConfigurationBuilder builder = new DefaultConfigurationBuilder();
+ Configuration configuration = builder.buildFromFile(configurationFilePath);
+ configure(configuration);
+ } catch (Exception e) {
+ System.err.println("Cannot load crawler configuration!");
+ }
+ new IterativeHTMLCrawler(new File(args[0])).crawl(new URL(baseURL), scopeURL[0]);
} else {
System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
-/*
- new IterativeHTMLCrawler(ce.resolvePath(ce.getURIList()),
- ce.resolvePath(ce.getHTDocsDumpDir()), ce.getUserAgent()).crawl(new URL(
- ce.getBaseURL()), ce.getScopeURL());
-*/
}
} catch (MalformedURLException e) {
log.error("" + e);
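Note: as committed, main() is no longer static, so the JVM cannot invoke it as an entry point, and it reads baseURL/scopeURL from the enclosing instance rather than from the crawler it constructs. A possible corrected sketch (static again; it reuses the File constructor, which already runs configure(), and assumes the crawl(URL, String) signature called above):

    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
            return;
        }
        try {
            // The File constructor builds and applies the configuration.
            IterativeHTMLCrawler crawler = new IterativeHTMLCrawler(new File(args[0]));
            // Same class, so the private baseURL/scopeURL fields are still reachable.
            crawler.crawl(new URL(crawler.baseURL), crawler.scopeURL[0]);
        } catch (MalformedURLException e) {
            log.error("" + e);
        }
    }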
@@ -75,15 +85,34 @@
}
/**
+ * Configures this crawler from a crawler.xconf document.
+ *
+ * @param configuration crawler configuration (base-url, scope-url, user-agent, uri-list, htdocs-dump-dir, robots)
+ *
+ * @throws org.apache.avalon.framework.configuration.ConfigurationException if a required element or attribute is missing
+ */
+ public void configure(org.apache.avalon.framework.configuration.Configuration configuration)
+ throws org.apache.avalon.framework.configuration.ConfigurationException {
+
+ this.baseURL = configuration.getChild("base-url").getAttribute("href");
+ this.scopeURL[0] = configuration.getChild("scope-url").getAttribute("href");
+ this.userAgent = configuration.getChild("user-agent").getValue();
+ this.uriList = configuration.getChild("uri-list").getAttribute("src");
+ this.htdocsDumpDir = configuration.getChild("htdocs-dump-dir").getAttribute("src");
+ this.robotsFile = configuration.getChild("robots").getAttribute("src");
+ this.robotsDomain = configuration.getChild("robots").getAttribute("domain");
+ }
+
+ /**
* Creates a new IterativeHTMLCrawler object.
*
- * @param url_list_file File where all dumped files will be listed
- * @param html_dump_directory Directory where htdocs should be dumped
+ * @param uriList File where all dumped files will be listed
+ * @param htdocsDumpDir Directory where htdocs should be dumped
* @param userAgent User-agent for robots.txt
*/
- public IterativeHTMLCrawler(String url_list_file, String html_dump_directory, String userAgent) {
- this.url_list_file = url_list_file;
- this.html_dump_directory = html_dump_directory;
+ public IterativeHTMLCrawler(String uriList, String htdocsDumpDir, String userAgent) {
+ this.uriList = uriList;
+ this.htdocsDumpDir = htdocsDumpDir;
robot = new RobotExclusion(userAgent);
}
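Note: from the reads in configure() above, a crawler.xconf consistent with this code would look roughly like the sketch below. Element and attribute names come straight from the code; the root element name and all values are illustrative guesses. Also note that configure() assigns scopeURL[0] without ever allocating the array, so as committed it would throw a NullPointerException unless scopeURL is initialized first (e.g. scopeURL = new String[1]).

    <crawler>
      <user-agent>lenya-crawler</user-agent>
      <base-url href="http://127.0.0.1:8888/index.html"/>
      <scope-url href="http://127.0.0.1:8888/"/>
      <uri-list src="work/search/uris.txt"/>
      <htdocs-dump-dir src="work/search/htdocs_dump"/>
      <robots src="robots.txt" domain="127.0.0.1"/>
    </crawler>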
@@ -94,19 +123,19 @@
* @param config Configuration File
*/
public IterativeHTMLCrawler(File config) {
- CrawlerConfiguration ce = new CrawlerConfiguration(config.getAbsolutePath());
-
-
- this.url_list_file = ce.resolvePath(ce.getURIList());
- this.html_dump_directory = ce.resolvePath(ce.getHTDocsDumpDir());
+ try {
+ DefaultConfigurationBuilder builder = new DefaultConfigurationBuilder();
+ Configuration configuration = builder.buildFromFile(config);
+ configure(configuration);
+ } catch (Exception e) {
+ System.err.println("Cannot load crawler configuration! ");
+ }
- robot = new RobotExclusion(ce.getUserAgent());
+ robot = new RobotExclusion(this.userAgent);
- String robots_file = ce.getRobotsFile();
- String robots_domain = ce.getRobotsDomain();
- if (robots_file != null && robots_domain != null) {
- log.debug(robots_file + " " + robots_domain);
- robot.addLocalEntries(robots_domain, new File(ce.resolvePath(robots_file)));
+ if (this.robotsFile != null && this.robotsDomain != null) {
+ log.debug(this.robotsFile + " " + this.robotsDomain);
+ robot.addLocalEntries(robotsDomain, new File(robotsFile));
}
}
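Note: unlike the removed CrawlerConfiguration code, this constructor no longer resolves uriList, htdocsDumpDir and the robots file via resolvePath(), so relative paths from crawler.xconf now resolve against the JVM working directory. A hypothetical helper that restores config-relative resolution (name and placement are illustrative, not part of the commit):

    // Hypothetical: resolve a path from crawler.xconf against the directory
    // containing the config file, approximating the removed
    // CrawlerConfiguration.resolvePath() behavior.
    private static File resolveAgainstConfig(File config, String path) {
        File f = new File(path);
        return f.isAbsolute() ? f : new File(config.getParentFile(), path);
    }

With that, the robots call above would read robot.addLocalEntries(robotsDomain, resolveAgainstConfig(config, robotsFile)).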
@@ -178,12 +207,12 @@
// Write all crawled URLs into file
try {
- File parent = new File(new File(url_list_file).getParent());
+ File parent = new File(new File(uriList).getParent());
if (!parent.isDirectory()) {
parent.mkdirs();
log.warn("Directory has been created: " + parent);
}
- java.io.PrintWriter out = new java.io.PrintWriter(new FileOutputStream(url_list_file));
+ java.io.PrintWriter out = new java.io.PrintWriter(new FileOutputStream(uriList));
for (int i = 0; i < urlsToCrawl.size(); i++) {
out.println("" + urlsToCrawl.elementAt(i));
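Note: this hunk shows the write loop but not what happens to the stream afterwards. Under the 1.4-era APIs used here, a defensive version of the same loop would close the writer in a finally block (sketch only; the original may well close it further down, outside the hunk):

    java.io.PrintWriter out = new java.io.PrintWriter(new FileOutputStream(uriList));
    try {
        for (int i = 0; i < urlsToCrawl.size(); i++) {
            out.println("" + urlsToCrawl.elementAt(i));
        }
    } finally {
        out.close(); // flushes and releases the file handle even on error
    }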
@@ -454,7 +483,7 @@
public void dumpHTDoc(URL url) {
String ext = getExtension(url);
- String filename = html_dump_directory + url.getFile();
+ String filename = htdocsDumpDir + url.getFile();
File file = new File(filename);
if (filename.charAt(filename.length() - 1) == '/') {
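Note: the hunk ends mid-method, but the visible logic maps the URL path straight onto htdocsDumpDir and then special-cases directory URLs ending in '/'. A sketch of the apparent intent (the index filename is an assumption, not taken from the commit):

    // Sketch only: map a crawled URL to a dump file, treating directory
    // URLs ("/foo/") as an index document. "index.html" is an assumed name.
    String filename = htdocsDumpDir + url.getFile();
    if (filename.endsWith("/")) {
        filename = filename + "index.html";
    }
    File file = new File(filename);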