You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by mi...@apache.org on 2004/03/05 12:00:06 UTC

cvs commit: cocoon-lenya/src/java/org/apache/lenya/search/crawler CrawlerConfiguration.java IterativeHTMLCrawler.java

michi       2004/03/05 03:00:06

  Modified:    src/java/org/apache/lenya/search/crawler
                        CrawlerConfiguration.java IterativeHTMLCrawler.java
  Log:
  robots.txt made configurable
  
  Revision  Changes    Path
  1.9       +47 -8     cocoon-lenya/src/java/org/apache/lenya/search/crawler/CrawlerConfiguration.java
  
  Index: CrawlerConfiguration.java
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/search/crawler/CrawlerConfiguration.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- CrawlerConfiguration.java	1 Mar 2004 16:18:19 -0000	1.8
  +++ CrawlerConfiguration.java	5 Mar 2004 11:00:06 -0000	1.9
  @@ -41,6 +41,8 @@
       private String scope_url;
       private String uri_list;
       private String htdocs_dump_dir;
  +    private String robots_file;
  +    private String robots_domain;
   
       /**
        * Creates a new CrawlerConfiguration object.
  @@ -108,13 +110,24 @@
               System.out.println(ce.resolvePath(parameter));
   
               parameter = ce.getHTDocsDumpDir();
  -            System.out.println(parameter);
  +            System.out.println("htdocs-dump-dir/@src: " + parameter);
               System.out.println(ce.resolvePath(parameter));
  +
  +            parameter = ce.getRobotsFile();
  +            if (parameter != null) {
  +                System.out.println("robots/@src: " + parameter);
  +                System.out.println(ce.resolvePath(parameter));
  +            }
  +
  +            parameter = ce.getRobotsDomain();
  +            if (parameter != null) {
  +                System.out.println("robots/@domain: " + parameter);
  +            }
           }
       }
   
       /**
  -     * DOCUMENT ME!
  +     * Extract parameters from configuration
        *
        * @param configuration DOCUMENT ME!
        *
  @@ -128,6 +141,10 @@
           user_agent = du.getElementValue(root, new XPath("user-agent"));
           uri_list = du.getAttributeValue(root, new XPath("uri-list/@src"));
           htdocs_dump_dir = du.getAttributeValue(root, new XPath("htdocs-dump-dir/@src"));
  +        if (du.elementExists(root, new XPath("robots"))) {
  +            robots_file = du.getAttributeValue(root, new XPath("robots/@src"));
  +            robots_domain = du.getAttributeValue(root, new XPath("robots/@domain"));
  +        }
       }
   
       /**
  @@ -175,9 +192,9 @@
       }
   
       /**
  -     * DOCUMENT ME!
  +     * Get htdocs-dump-dir/@src
        *
  -     * @return DOCUMENT ME!
  +     * @return htdocs-dump-dir/@src
        */
       public String getHTDocsDumpDir() {
           log.debug(".getHTDocsDumpDir(): " + htdocs_dump_dir);
  @@ -186,11 +203,33 @@
       }
   
       /**
  -     * DOCUMENT ME!
  +     * Get robots/@src
  +     *
  +     * @return robots/@src
  +     */
  +    public String getRobotsFile() {
  +        log.debug(robots_file);
  +
  +        return robots_file;
  +    }
  +
  +    /**
  +     * Get robots/@domain
  +     *
  +     * @return robots/@domain
  +     */
  +    public String getRobotsDomain() {
  +        log.debug(robots_domain);
  +
  +        return robots_domain;
  +    }
  +
  +    /**
  +     * Resolve path
        *
  -     * @param path DOCUMENT ME!
  +     * @param path Original path
        *
  -     * @return DOCUMENT ME!
  +     * @return Resolved path
        */
       public String resolvePath(String path) {
           if (path.indexOf(File.separator) == 0) {
  
  
  
  1.24      +48 -20    cocoon-lenya/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java
  
  Index: IterativeHTMLCrawler.java
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/java/org/apache/lenya/search/crawler/IterativeHTMLCrawler.java,v
  retrieving revision 1.23
  retrieving revision 1.24
  diff -u -r1.23 -r1.24
  --- IterativeHTMLCrawler.java	1 Mar 2004 16:18:19 -0000	1.23
  +++ IterativeHTMLCrawler.java	5 Mar 2004 11:00:06 -0000	1.24
  @@ -51,6 +51,35 @@
       private RobotExclusion robot;
   
       /**
  +     * Command line interface
  +     *
  +     * @param args Configuration file crawler.xconf
  +     */
  +    public static void main(String[] args) {
  +        if (args.length == 0) {
  +            System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
  +
  +            return;
  +        }
  +
  +        try {
  +            if (args.length == 1) {
  +                CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
  +                new IterativeHTMLCrawler(new File(args[0])).crawl(new URL(ce.getBaseURL()), ce.getScopeURL());
  +	    } else {
  +                System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
  +/*
  +                new IterativeHTMLCrawler(ce.resolvePath(ce.getURIList()),
  +                    ce.resolvePath(ce.getHTDocsDumpDir()), ce.getUserAgent()).crawl(new URL(
  +                        ce.getBaseURL()), ce.getScopeURL());
  +*/
  +            }
  +        } catch (MalformedURLException e) {
  +            log.error("" + e);
  +        }
  +    }
  +
  +    /**
        * Creates a new IterativeHTMLCrawler object.
        *
        * @param url_list_file File where all dumped files will be listed
  @@ -62,36 +91,35 @@
           this.html_dump_directory = html_dump_directory;
   
           robot = new RobotExclusion(userAgent);
  -        robot.addLocalEntries("cocoon.apache.org", new File("/home/USERNAME/src/cocoon-lenya/robots.txt"));
       }
   
       /**
  -     * Command line interface
  +     * Creates a new IterativeHTMLCrawler object.
        *
  -     * @param args Configuration file crawler.xconf
  +     * @param config Configuration File
        */
  -    public static void main(String[] args) {
  -        if (args.length != 1) {
  -            System.err.println("Usage: IterativeHTMLCrawler crawler.xconf");
  +    public IterativeHTMLCrawler(File config) {
  +        CrawlerConfiguration ce = new CrawlerConfiguration(config.getAbsolutePath());
   
  -            return;
  -        }
   
  -        try {
  -            CrawlerConfiguration ce = new CrawlerConfiguration(args[0]);
  -            new IterativeHTMLCrawler(ce.resolvePath(ce.getURIList()),
  -                ce.resolvePath(ce.getHTDocsDumpDir()), ce.getUserAgent()).crawl(new URL(
  -                    ce.getBaseURL()), ce.getScopeURL());
  -        } catch (MalformedURLException e) {
  -            log.error("" + e);
  +        this.url_list_file = ce.resolvePath(ce.getURIList());
  +        this.html_dump_directory = ce.resolvePath(ce.getHTDocsDumpDir());
  +
  +        robot = new RobotExclusion(ce.getUserAgent());
  +
  +        String robots_file = ce.getRobotsFile();
  +        String robots_domain = ce.getRobotsDomain();
  +        if (robots_file != null && robots_domain != null) {
  +            log.debug(robots_file + " " + robots_domain);
  +            robot.addLocalEntries(robots_domain, new File(ce.resolvePath(robots_file)));
           }
       }
   
       /**
  -     * DOCUMENT ME!
  +     * Crawl
        *
  -     * @param start DOCUMENT ME!
  -     * @param scope DOCUMENT ME!
  +     * @param start Start crawling at this URL
  +     * @param scope Limit crawling to this scope
        */
       public void crawl(URL start, String scope) {
           scopeURL = new String[1];
  @@ -143,7 +171,7 @@
                               dumpHTDoc(urlToCrawl);
                           }
                       } catch (MalformedURLException e) {
  -                        log.error("" + e + " " + urlCandidate);
  +                        log.warn("" + e + " " + urlCandidate);
                       }
                   }
               }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lenya-cvs-unsubscribe@cocoon.apache.org
For additional commands, e-mail: lenya-cvs-help@cocoon.apache.org