You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by mo...@apache.org on 2012/03/02 15:13:36 UTC

svn commit: r1296216 - in /incubator/any23/trunk/plugins/basic-crawler: pom.xml src/main/java/org/apache/any23/cli/Crawler.java src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java src/test/java/org/apache/any23/cli/CrawlerTest.java

Author: mostarda
Date: Fri Mar  2 14:13:35 2012
New Revision: 1296216

URL: http://svn.apache.org/viewvc?rev=1296216&view=rev
Log:
Migrated to crawler4j version 3.3. This commit is related to issue #ANY23-37.

Modified:
    incubator/any23/trunk/plugins/basic-crawler/pom.xml
    incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java
    incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java
    incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java

Modified: incubator/any23/trunk/plugins/basic-crawler/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/pom.xml?rev=1296216&r1=1296215&r2=1296216&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/pom.xml (original)
+++ incubator/any23/trunk/plugins/basic-crawler/pom.xml Fri Mar  2 14:13:35 2012
@@ -49,39 +49,14 @@
       <scope>provided</scope>
     </dependency>
 
-    <!-- BEGIN: Crawler4j -->
-    <dependency>
-      <groupId>edu.uci.ics</groupId>
-      <artifactId>crawler4j</artifactId>
-      <version>2.6.1</version>
+    <!-- Crawler4j -->
+ 	<dependency>
+        <groupId>edu.uci.ics</groupId>
+        <artifactId>crawler4j</artifactId>
+        <version>3.3</version>
+        <type>jar</type>
+        <scope>compile</scope>
     </dependency>
-    <dependency>
-      <groupId>com.sleepycat</groupId>
-      <artifactId>je</artifactId>
-      <version>4.0.92</version>
-    </dependency>
-    <dependency>
-      <groupId>it.unimi.dsi</groupId>
-      <artifactId>fastutil</artifactId>
-      <version>6.4.1</version>
-    </dependency>
-    <dependency>
-      <groupId>it.unimi.dsi</groupId>
-      <artifactId>dsiutils</artifactId>
-      <version>2.0.1</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.httpcomponents</groupId>
-      <artifactId>httpclient</artifactId>
-      <version>4.1</version>
-    </dependency>
-    <!--TODO: resolve dependency conflict.-->
-    <dependency>
-      <groupId>commons-codec</groupId>
-      <artifactId>commons-codec</artifactId>
-      <version>1.4</version>
-    </dependency>
-    <!-- END: Crawler4j -->
   </dependencies>
 
   <build>

Modified: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java?rev=1296216&r1=1296215&r2=1296216&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java Fri Mar  2 14:13:35 2012
@@ -18,6 +18,8 @@
 package org.apache.any23.cli;
 
 import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.parser.HtmlParseData;
+import edu.uci.ics.crawler4j.parser.ParseData;
 import org.apache.any23.plugin.crawler.CrawlerListener;
 import org.apache.any23.plugin.crawler.SiteCrawler;
 import org.apache.any23.source.StringDocumentSource;
@@ -86,20 +88,25 @@ public class Crawler extends Rover {
                 public void visitedPage(Page page) {
                     final String pageURL = page.getWebURL().getURL();
                     System.err.println( String.format("Processing page: [%s]", pageURL) );
-                    try {
-                        synchronized (roverLock) {
-                            Crawler.super.performExtraction(
-                                    new StringDocumentSource(
-                                            page.getHTML(),
-                                            pageURL
 
-                                    )
+                    final ParseData parseData = page.getParseData();
+                    if (parseData instanceof HtmlParseData) {
+                        final HtmlParseData htmlParseData = (HtmlParseData) parseData;
+                        try {
+                            synchronized (roverLock) {
+                                Crawler.super.performExtraction(
+                                        new StringDocumentSource(
+                                                htmlParseData.getHtml(),
+                                                pageURL
+
+                                        )
+                                );
+                            }
+                        } catch (Exception e) {
+                            System.err.println(
+                                    String.format("Error while processing page [%s], error: %s .", pageURL, e.getMessage())
                             );
                         }
-                    } catch (Exception e) {
-                        System.err.println(
-                                String.format("Error while processing page [%s], error: %s .", pageURL, e.getMessage())
-                        );
                     }
                 }
             });

Modified: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java?rev=1296216&r1=1296215&r2=1296216&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java Fri Mar  2 14:13:35 2012
@@ -17,8 +17,12 @@
 
 package org.apache.any23.plugin.crawler;
 
+import edu.uci.ics.crawler4j.crawler.CrawlConfig;
 import edu.uci.ics.crawler4j.crawler.CrawlController;
 import edu.uci.ics.crawler4j.crawler.WebCrawler;
+import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
 
 import java.io.File;
 import java.net.URL;
@@ -85,19 +89,9 @@ public class SiteCrawler {
     private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER;
 
     /**
-     * Max allowed depth, <code>-1</code> means no limit.
+     * Internal crawler configuration.
      */
-    private int maxDepth = -1;
-
-    /**
-     *  Max allowed pages, <code>-1</code> means no limit.
-     */
-    private int maxPages = -1;
-
-    /**
-     * Subsequent call politeness delay, <code>-1</code> means no limit.
-     */
-    private int politenessDelay = -1;
+    private final CrawlConfig crawlConfig;
 
     /**
      * Internal executor service.
@@ -111,7 +105,15 @@ public class SiteCrawler {
      */
     public SiteCrawler(File storageFolder) {
         try {
-            controller = new CrawlController( storageFolder.getAbsolutePath() );
+            crawlConfig = new CrawlConfig();
+            crawlConfig.setCrawlStorageFolder( storageFolder.getAbsolutePath() );
+            
+            final PageFetcher pageFetcher = new PageFetcher(crawlConfig);
+
+            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
+            final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
+            
+            controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
         } catch (Exception e) {
             throw new IllegalArgumentException("Error while initializing crawler controller.", e);
         }
@@ -139,7 +141,7 @@ public class SiteCrawler {
     }
 
     /**
-     * Sets the actual crawler clas.
+     * Sets the actual crawler class.
      *
      * @param c a not <code>class</code>.
      */
@@ -152,7 +154,7 @@ public class SiteCrawler {
      * @return the max allowed crawl depth, <code>-1</code> means no limit.
      */
     public int getMaxDepth() {
-        return maxDepth;
+        return crawlConfig.getMaxDepthOfCrawling();
     }
 
     /**
@@ -162,19 +164,14 @@ public class SiteCrawler {
      */
     public void setMaxDepth(int maxDepth) {
         if(maxDepth < -1 || maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0");
-        if(maxDepth > 0) try {
-            controller.setMaximumCrawlDepth(maxDepth);
-        } catch (Exception e) {
-            throw new IllegalArgumentException("Error while setting maxDepth.", e);
-        }
-        this.maxDepth = maxDepth;
+        crawlConfig.setMaxDepthOfCrawling(maxDepth);
     }
 
     /**
      * @return max number of allowed pages.
      */
     public int getMaxPages() {
-        return maxPages;
+        return crawlConfig.getMaxPagesToFetch();
     }
 
     /**
@@ -184,26 +181,23 @@ public class SiteCrawler {
      */
     public void setMaxPages(int maxPages) {
         if(maxPages < -1 || maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
-        if(maxPages > 0) controller.setMaximumPagesToFetch(maxPages);
-        this.maxPages = maxPages;
+        crawlConfig.setMaxPagesToFetch(maxPages);
     }
 
     /**
      * @return the politeness delay in milliseconds.
      */
     public int getPolitenessDelay() {
-        return politenessDelay;
+        return crawlConfig.getPolitenessDelay();
     }
 
     /**
-     * Sets the politeness delay. <code>-1</code> means no politeness.
+     * Sets the politeness delay.
      *
      * @param millis delay in milliseconds.
      */
     public void setPolitenessDelay(int millis) {
-        if(millis < -1) throw new IllegalArgumentException("Invalid politenessDelay, must be >= -1");
-        if(millis >= 0) controller.setPolitenessDelay(millis);
-        this.politenessDelay = millis;
+        if(millis >= 0) crawlConfig.setPolitenessDelay(millis);
     }
 
     /**

Modified: incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java?rev=1296216&r1=1296215&r2=1296216&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java Fri Mar  2 14:13:35 2012
@@ -33,6 +33,7 @@ import java.io.IOException;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 /**
  * Test case for {@link Crawler} CLI.
@@ -70,6 +71,9 @@ public class CrawlerTest extends Any23On
             future.get(10, TimeUnit.SECONDS);
         } catch (Exception e) {
             // OK.
+            if( ! (e instanceof TimeoutException) ) {
+                e.printStackTrace();
+            }
         }
         Assert.assertTrue("The output file has not been created.", outFile.exists());