You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by mi...@apache.org on 2004/03/05 12:00:06 UTC
cvs commit: cocoon-lenya/src/java/org/apache/lenya/search/crawler CrawlerConfiguration.java IterativeHTMLCrawler.java
michi 2004/03/05 03:00:06
Modified: src/java/org/apache/lenya/search/crawler
CrawlerConfiguration.java IterativeHTMLCrawler.java
Log:
robots.txt made configurable
Revision Changes Path
1.9 +47 -8 cocoon-lenya/src/java/org/apache/lenya/search/crawler/CrawlerConfiguration.java
Index: CrawlerConfiguration.java
===================================================================
RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/search/crawler/CrawlerConfiguration.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- CrawlerConfiguration.java 1 Mar 2004 16:18:19 -0000 1.8
+++ CrawlerConfiguration.java 5 Mar 2004 11:00:06 -0000 1.9
@@ -41,6 +41,8 @@
private String scope_url;
private String uri_list;
private String htdocs_dump_dir;
+ private String robots_file;
+ private String robots_domain;
/**
* Creates a new CrawlerConfiguration object.
@@ -108,13 +110,24 @@
System.out.println(ce.resolvePath(parameter));
parameter = ce.getHTDocsDumpDir();
- System.out.println(parameter);
+ System.out.println("htdocs-dump-dir/@src: " + parameter);
System.out.println(ce.resolvePath(parameter));
+
+ parameter = ce.getRobotsFile();
+ if (parameter != null) {
+ System.out.println("robots/@src: " + parameter);
+ System.out.println(ce.resolvePath(parameter));
+ }
+
+ parameter = ce.getRobotsDomain();
+ if (parameter != null) {
+ System.out.println("robots/@domain: " + parameter);
+ }
}
}
/**
- * DOCUMENT ME!
+ * Extract parameters from configuration
*
* @param configuration DOCUMENT ME!
*
@@ -128,6 +141,10 @@
user_agent = du.getElementValue(root, new XPath("user-agent"));
uri_list = du.getAttributeValue(root, new XPath("uri-list/@src"));
htdocs_dump_dir = du.getAttributeValue(root, new XPath("htdocs-dump-dir/@src"));
+ if (du.elementExists(root, new XPath("robots"))) {
+ robots_file = du.getAttributeValue(root, new XPath("robots/@src"));
+ robots_domain = du.getAttributeValue(root, new XPath("robots/@domain"));
+ }
}
/**
@@ -175,9 +192,9 @@
}
/**
- * DOCUMENT ME!
+ * Get htdocs-dump-dir/@src
*
- * @return DOCUMENT ME!
+ * @return htdocs-dump-dir/@src
*/
public String getHTDocsDumpDir() {
log.debug(".getHTDocsDumpDir(): " + htdocs_dump_dir);
@@ -186,11 +203,33 @@
}
/**
- * DOCUMENT ME!
+ * Get robots/@src
+ *
+ * @return robots/@src
+ */
+ public String getRobotsFile() {
+ log.debug(robots_file);
+
+ return robots_file;
+ }
+
+ /**
+ * Get robots/@domain
+ *
+ * @return robots/@domain
+ */
+ public String getRobotsDomain() {
+ log.debug(robots_domain);
+
+ return robots_domain;
+ }
+
+ /**
+ * Resolve path
*
- * @param path DOCUMENT ME!
+ * @param path Original path
*
- * @return DOCUMENT ME!
+ * @return Resolved path
*/
public String resolvePath(String path) {
if (path.indexOf(File.separator) == 0) {
1.24 +48 -20 cocoon-lenya/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
Index: IterativeHTMLCrawler.java
===================================================================
RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -r1.23 -r1.24
--- IterativeHTMLCrawler.java 1 Mar 2004 16:18:19 -0000 1.23
+++ IterativeHTMLCrawler.java 5 Mar 2004 11:00:06 -0000 1.24
@@ -51,6 +51,35 @@
private RobotExclusion robot;
/**
+ * Command line interface
+ *
+ * @param args Configuration file crawler.xconf
+ */
+ public static void main(String[] args) {
+ if (args.length == 0) {
+ System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
+
+ return;
+ }
+
+ try {
+ if (args.length == 1) {
+ CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
+ new IterativeHTMLCrawler(new File(args[0])).crawl(new URL(ce.getBaseURL()), ce.getScopeURL());
+ } else {
+ System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
+/*
+ new IterativeHTMLCrawler(ce.resolvePath(ce.getURIList()),
+ ce.resolvePath(ce.getHTDocsDumpDir()), ce.getUserAgent()).crawl(new URL(
+ ce.getBaseURL()), ce.getScopeURL());
+*/
+ }
+ } catch (MalformedURLException e) {
+ log.error("" + e);
+ }
+ }
+
+ /**
* Creates a new IterativeHTMLCrawler object.
*
* @param url_list_file File where all dumped files will be listed
@@ -62,36 +91,35 @@
this.html_dump_directory = html_dump_directory;
robot = new RobotExclusion(userAgent);
- robot.addLocalEntries("cocoon.apache.org", new File("/home/USERNAME/src/cocoon-lenya/robots.txt"));
}
/**
- * Command line interface
+ * Creates a new IterativeHTMLCrawler object.
*
- * @param args Configuration file crawler.xconf
+ * @param config Configuration File
*/
- public static void main(String[] args) {
- if (args.length != 1) {
- System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
+ public IterativeHTMLCrawler(File config) {
+ CrawlerConfiguration ce = new CrawlerConfiguration(config.getAbsolutePath());
- return;
- }
- try {
- CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
- new IterativeHTMLCrawler(ce.resolvePath(ce.getURIList()),
- ce.resolvePath(ce.getHTDocsDumpDir()), ce.getUserAgent()).crawl(new URL(
- ce.getBaseURL()), ce.getScopeURL());
- } catch (MalformedURLException e) {
- log.error("" + e);
+ this.url_list_file = ce.resolvePath(ce.getURIList());
+ this.html_dump_directory = ce.resolvePath(ce.getHTDocsDumpDir());
+
+ robot = new RobotExclusion(ce.getUserAgent());
+
+ String robots_file = ce.getRobotsFile();
+ String robots_domain = ce.getRobotsDomain();
+ if (robots_file != null && robots_domain != null) {
+ log.debug(robots_file + " " + robots_domain);
+ robot.addLocalEntries(robots_domain, new File(ce.resolvePath(robots_file)));
}
}
/**
- * DOCUMENT ME!
+ * Crawl
*
- * @param start DOCUMENT ME!
- * @param scope DOCUMENT ME!
+ * @param start Start crawling at this URL
+ * @param scope Limit crawling to this scope
*/
public void crawl(URL start, String scope) {
scopeURL = new String[1];
@@ -143,7 +171,7 @@
dumpHTDoc(urlToCrawl);
}
} catch (MalformedURLException e) {
- log.error("" + e + " " + urlCandidate);
+ log.warn("" + e + " " + urlCandidate);
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: lenya-cvs-unsubscribe@cocoon.apache.org
For additional commands, e-mail: lenya-cvs-help@cocoon.apache.org