Posted to commits@any23.apache.org by mo...@apache.org on 2012/04/03 11:40:04 UTC

svn commit: r1308786 [2/2] - in /incubator/any23/trunk: ./ core/ core/src/main/java/org/apache/any23/cli/ core/src/main/java/org/apache/any23/writer/ core/src/main/resources/ core/src/test/java/org/apache/any23/cli/ plugins/basic-crawler/ plugins/basic...

Modified: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java?rev=1308786&r1=1308785&r2=1308786&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java Tue Apr  3 09:40:03 2012
@@ -17,24 +17,27 @@
 
 package org.apache.any23.cli;
 
+import com.beust.jcommander.IStringConverter;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.ParameterException;
+import com.beust.jcommander.Parameters;
+import com.beust.jcommander.converters.FileConverter;
 import edu.uci.ics.crawler4j.crawler.Page;
 import edu.uci.ics.crawler4j.parser.HtmlParseData;
 import edu.uci.ics.crawler4j.parser.ParseData;
 import org.apache.any23.plugin.crawler.CrawlerListener;
 import org.apache.any23.plugin.crawler.SiteCrawler;
 import org.apache.any23.source.StringDocumentSource;
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
 import org.kohsuke.MetaInfServices;
 
 import java.io.File;
-import java.io.IOException;
 import java.net.URL;
 import java.util.UUID;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
+import static java.lang.String.format;
+
 /**
  * Implementation of a <b>CLI crawler</b> based on
  * {@link Rover}.
@@ -42,156 +45,118 @@ import java.util.regex.PatternSyntaxExce
  * @author Michele Mostarda (mostarda@fbk.eu)
  */
 @MetaInfServices( value = Tool.class )
-@ToolRunner.Description("Any23 Crawler Command Line Tool.")
+@Parameters(commandNames = "crawler", commandDescription = "Any23 Crawler Command Line Tool.")
 public class Crawler extends Rover {
 
     private final Object roverLock = new Object();
 
-    public static void main(String[] args) {
-        try {
-            System.exit( new Crawler().run(args) );
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-    }
+    @Parameter(
+       names = { "-pf", "--pagefilter" },
+       description = "Regex used to filter out page URLs during crawling.",
+       converter = PatternConverter.class
+    )
+    private Pattern pageFilter = Pattern.compile( SiteCrawler.DEFAULT_PAGE_FILTER_RE );
+
+    @Parameter(
+       names = { "-sf", "--storagefolder" },
+       description = "Folder used to store crawler temporary data.",
+       converter = FileConverter.class
+    )
+    private File storageFolder = new File(System.getProperty("java.io.tmpdir"), "crawler-metadata-" + UUID.randomUUID().toString());
+
+    @Parameter(names = { "-nc", "--numcrawlers" }, description = "Sets the number of crawlers.")
+    private int numCrawlers = SiteCrawler.DEFAULT_NUM_OF_CRAWLERS;
 
-    @Override
-    public int run(String[] args) {
-        try {
-            final String[] seeds = super.configure(args);
-            if(seeds.length != 1) throw new IllegalArgumentException("Expected just one seed.");
-            final URL seed = new URL(seeds[0]);
+    @Parameter(names = { "-mp", "--maxpages" }, description = "Max number of pages before interrupting crawl.")
+    private int maxPages = Integer.MAX_VALUE;
 
-            final CommandLine commandLine = super.getCommandLine();
+    @Parameter(names = { "-md", "--maxdepth" }, description = "Max allowed crawler depth.")
+    private int maxDepth = Integer.MAX_VALUE;
 
-            final SiteCrawler siteCrawler = new SiteCrawler( getStorageFolder(commandLine) );
+    @Parameter(names = { "-pd", "--politenessdelay" }, description = "Politeness delay in milliseconds.")
+    private int politenessDelay = Integer.MAX_VALUE;
 
-            final Pattern specifiedPageFilter = getPageFilter(commandLine);
-            final Pattern pageFilter = specifiedPageFilter == null ? siteCrawler.defaultFilters : specifiedPageFilter;
+    @Override
+    public void run() throws Exception {
+        super.configure();
 
-            if(commandLine.hasOption("numcrawlers")) {
-                siteCrawler.setNumOfCrawlers( parseInt(commandLine, "numcrawlers") );
-            }
-            if(commandLine.hasOption("maxpages")) {
-                siteCrawler.setMaxPages(parseInt(commandLine, "maxpages"));
-            }
-            if(commandLine.hasOption("maxdepth")) {
-                siteCrawler.setMaxDepth(parseInt(commandLine, "maxdepth"));
-            }
-            if (commandLine.hasOption("politenessdelay")) {
-                final int politenessDelay = parseInt(commandLine, "politenessdelay");
-                if(politenessDelay >= 0) siteCrawler.setPolitenessDelay(politenessDelay);
+        if (inputURIs.size() != 1) {
+            throw new IllegalArgumentException("Expected just one seed.");
+        }
+        final URL seed = new URL(inputURIs.get( 0 ));
+
+        if ( storageFolder.isFile() ) {
+            throw new IllegalStateException( format( "Storage folder %s cannot be a file, must be a directory",
+                                                     storageFolder ) );
+        }
+
+        if ( !storageFolder.exists() ) {
+            if ( !storageFolder.mkdirs() ) {
+                throw new IllegalStateException(
+                        format( "Storage folder %s cannot be created, please verify you have enough permissions",
+                                                         storageFolder ) );
             }
+        }
+
+        final SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
+        siteCrawler.setNumOfCrawlers( numCrawlers );
+        siteCrawler.setMaxPages( maxPages );
+        siteCrawler.setMaxDepth( maxDepth );
+        siteCrawler.setPolitenessDelay(politenessDelay);
+
+        siteCrawler.addListener(new CrawlerListener() {
+            @Override
+            public void visitedPage(Page page) {
+                final String pageURL = page.getWebURL().getURL();
+                System.err.println( format("Processing page: [%s]", pageURL) );
+
+                final ParseData parseData = page.getParseData();
+                if (parseData instanceof HtmlParseData) {
+                    final HtmlParseData htmlParseData = (HtmlParseData) parseData;
+                    try {
+                        synchronized (roverLock) {
+                            Crawler.super.performExtraction(
+                                    new StringDocumentSource(
+                                            htmlParseData.getHtml(),
+                                            pageURL
 
-            siteCrawler.addListener(new CrawlerListener() {
-                @Override
-                public void visitedPage(Page page) {
-                    final String pageURL = page.getWebURL().getURL();
-                    System.err.println( String.format("Processing page: [%s]", pageURL) );
-
-                    final ParseData parseData = page.getParseData();
-                    if (parseData instanceof HtmlParseData) {
-                        final HtmlParseData htmlParseData = (HtmlParseData) parseData;
-                        try {
-                            synchronized (roverLock) {
-                                Crawler.super.performExtraction(
-                                        new StringDocumentSource(
-                                                htmlParseData.getHtml(),
-                                                pageURL
-
-                                        )
-                                );
-                            }
-                        } catch (Exception e) {
-                            System.err.println(
-                                    String.format("Error while processing page [%s], error: %s .", pageURL, e.getMessage())
+                                    )
                             );
                         }
-                    }
-                }
-            });
-
-            Runtime.getRuntime().addShutdownHook( new Thread() {
-                @Override
-                public void run() {
-                    try {
-                        System.err.println( Crawler.super.printReports() );
-                        // siteCrawler.stop(); // TODO: cause shutdown hanging.
                     } catch (Exception e) {
-                        e.printStackTrace();
+                        System.err.println(format("Error while processing page [%s], error: %s .",
+                                                  pageURL, e.getMessage())
+                        );
                     }
                 }
-            });
-            siteCrawler.start(seed, pageFilter, true);
-            return 0;
-        } catch (Exception e) {
-            if(super.isVerbose()) e.printStackTrace();
-            if(e instanceof ExitCodeException) {
-                return ((ExitCodeException) e).getExitCode();
             }
-            return 1;
-        }
-    }
+        });
 
-    @Override
-    protected Options createOptions() {
-        final Options roverOptions = super.createOptions();
-        addCrawlerOptions(roverOptions);
-        return roverOptions;
+        Runtime.getRuntime().addShutdownHook( new Thread() {
+            @Override
+            public void run() {
+                try {
+                    System.err.println( Crawler.super.printReports() );
+                    // siteCrawler.stop(); // TODO: cause shutdown hanging.
+                } catch (Exception e) {
+                    e.printStackTrace(System.err);
+                }
+            }
+        });
+        siteCrawler.start(seed, pageFilter, true);
     }
 
-    private void addCrawlerOptions(Options options) {
-        options.addOption(
-                new Option("pagefilter"     , true, "Regex used to filter out page URLs during crawling. Default: '" + SiteCrawler.DEFAULT_PAGE_FILTER_RE + "'")
-        );
-        options.addOption(
-                new Option("storagefolder"  , true, "Folder used to store crawler temporary data. Default: [" + System.getProperty("java.io.tmpdir")  + "]")
-        );
-        options.addOption(
-                new Option("numcrawlers"    , true, "Sets the number of crawlers. Default: " + SiteCrawler.DEFAULT_NUM_OF_CRAWLERS)
-        );
-        options.addOption(
-                new Option("maxpages"       , true, "Max number of pages before interrupting crawl. Default: no limit.")
-        );
-        options.addOption(
-                new Option("maxdepth"       , true, "Max allowed crawler depth. Default: no limit.")
-        );
-        options.addOption(
-                new Option("politenessdelay", true, "Politeness delay in milliseconds. Default: no limit.")
-        );
-    }
+    public static final class PatternConverter implements IStringConverter<Pattern> {
 
-    private Pattern getPageFilter(CommandLine commandLine) {
-        if(commandLine.hasOption("pagefilter")) {
+        @Override
+        public Pattern convert( String value ) {
             try {
-                return Pattern.compile( commandLine.getOptionValue("pagefilter") );
+                return Pattern.compile( value );
             } catch (PatternSyntaxException pse) {
-                throw new ExitCodeException("Invalid page filter, must be a regular expression.", 6);
+                throw new ParameterException( format("Invalid page filter, '%s' must be a regular expression.", value) );
             }
         }
-        return null;
-    }
 
-    private File getStorageFolder(CommandLine commandLine) throws IOException {
-        if(commandLine.hasOption("storagefolder")) {
-           final File candidate = new  File( commandLine.getOptionValue("storagefolder") );
-           if(candidate.exists() && candidate.isFile())
-               throw new IllegalArgumentException("The storage folder must be a directory.");
-            return candidate;
-        } else {
-            final File tmpDir = File.createTempFile("crawler-metadata-" + UUID.randomUUID().toString(), "db");
-            tmpDir.delete();
-            return tmpDir;
-        }
-    }
-
-    private int parseInt(CommandLine cl, String option) {
-        final String value = cl.getOptionValue(option);
-        try {
-            return Integer.parseInt(value);
-        } catch (NumberFormatException nfe) {
-            throw new IllegalArgumentException(String.format("Expected integer for %s found '%s' .", option, value));
-        }
     }
 
 }

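The core of this change is the move from Commons CLI (Options/CommandLine and hand-rolled
option parsing) to JCommander's annotation-driven parameters: each option becomes a
@Parameter-annotated field carrying its own default, and regex-valued options are converted
by a custom IStringConverter. The following minimal sketch is not part of the commit; the
JCommanderSketch/RegexConverter names, the defaults and the sample arguments are illustrative
only. It shows how JCommander binds such annotated fields:

    import com.beust.jcommander.IStringConverter;
    import com.beust.jcommander.JCommander;
    import com.beust.jcommander.Parameter;
    import com.beust.jcommander.ParameterException;

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    public class JCommanderSketch {

        // Regex-valued option backed by a custom converter, mirroring the
        // crawler's --pagefilter parameter.
        @Parameter(
            names = { "-pf", "--pagefilter" },
            description = "Regex used to filter page URLs.",
            converter = RegexConverter.class
        )
        private Pattern pageFilter = Pattern.compile(".*");

        @Parameter(names = { "-nc", "--numcrawlers" }, description = "Number of crawlers.")
        private int numCrawlers = 10;

        public static final class RegexConverter implements IStringConverter<Pattern> {
            @Override
            public Pattern convert(String value) {
                try {
                    return Pattern.compile(value);
                } catch (PatternSyntaxException pse) {
                    throw new ParameterException("'" + value + "' is not a valid regular expression.");
                }
            }
        }

        public static void main(String[] args) {
            JCommanderSketch options = new JCommanderSketch();
            // JCommander populates the annotated fields directly from the argument array.
            new JCommander(options).parse(args);
            System.out.println("pageFilter=" + options.pageFilter.pattern()
                    + ", numCrawlers=" + options.numCrawlers);
        }
    }

Invoked as, say, "java JCommanderSketch --pagefilter '.*html' -nc 4" this prints the parsed
values; an invalid regex surfaces as the ParameterException raised by the converter, which is
the same mechanism the new Crawler uses to reject a bad --pagefilter value.
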
Modified: incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java?rev=1308786&r1=1308785&r2=1308786&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java Tue Apr  3 09:40:03 2012
@@ -17,8 +17,6 @@
 
 package org.apache.any23.cli;
 
-import static org.junit.Assert.*;
-
 import org.apache.any23.Any23OnlineTestBase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.util.FileUtils;
@@ -36,6 +34,8 @@ import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
+import static org.junit.Assert.assertTrue;
+
 /**
  * Test case for {@link Crawler} CLI.
  *
@@ -57,13 +57,17 @@ public class CrawlerTest extends Any23On
             new Runnable() {
                 @Override
                 public void run() {
-                    Crawler.main(
-                            String.format(
-                                    "-f nquads -maxpages 50 -maxdepth 1 -politenessdelay 500 -o %s " +
-                                    "http://eventiesagre.it/",
-                                    outFile.getAbsolutePath()
-                            ).split(" ")
-                    );
+                    try {
+                        ToolRunner.main(
+                                String.format(
+                                        "crawler -f nquads --maxpages 50 --maxdepth 1 --politenessdelay 500 -o %s " +
+                                        "http://eventiesagre.it/",
+                                        outFile.getAbsolutePath()
+                                ).split(" ")
+                        );
+                    } catch (Exception e) {
+                        e.printStackTrace();
+                    }
                 }
             }
         );
@@ -80,7 +84,7 @@ public class CrawlerTest extends Any23On
 
         final String[] lines = FileUtils.readFileLines(outFile);
         final StringBuilder allLinesExceptLast = new StringBuilder();
-        for(int i = 0; i < lines.length - 1; i++) {
+        for (int i = 0; i < lines.length - 1; i++) {
             allLinesExceptLast.append(lines[i]);
         }
 

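The test now drives the crawler through ToolRunner instead of a Crawler.main entry point,
addressing the tool by the "crawler" command name declared via
@Parameters(commandNames = "crawler") and using the new double-dash long options. The sketch
below illustrates JCommander's generic named-command mechanism that such a dispatcher can
build on; CommandDispatchSketch and the /tmp/out.nq output path are illustrative, and this is
not necessarily how ToolRunner itself is implemented:

    import com.beust.jcommander.JCommander;

    public class CommandDispatchSketch {

        public static void main(String[] args) throws Exception {
            Crawler crawler = new Crawler();

            // Register the tool under a command name; @Parameters(commandNames = "crawler")
            // documents the same name on the class itself.
            JCommander jc = new JCommander();
            jc.addCommand("crawler", crawler);

            // Same argument shape as the test: command name first, then its options.
            jc.parse("crawler", "-f", "nquads", "--maxpages", "50",
                     "--maxdepth", "1", "--politenessdelay", "500",
                     "-o", "/tmp/out.nq", "http://eventiesagre.it/");

            if ("crawler".equals(jc.getParsedCommand())) {
                crawler.run();  // the @Parameter fields are already populated
            }
        }
    }

With the options expressed as annotations, a single dispatcher can register every Tool
implementation as a named command and get per-tool usage text for free, which is what allows
the test to go through the shared ToolRunner entry point.
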
Modified: incubator/any23/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/pom.xml?rev=1308786&r1=1308785&r2=1308786&view=diff
==============================================================================
--- incubator/any23/trunk/pom.xml (original)
+++ incubator/any23/trunk/pom.xml Tue Apr  3 09:40:03 2012
@@ -209,6 +209,7 @@
     <javac.target.version>1.6</javac.target.version>
     <maven.build.timestamp.format>yyyy-MM-dd HH:mm:ssZ</maven.build.timestamp.format>
     <implementation.build>${scmBranch}@r${buildNumber}</implementation.build>
+    <implementation.build.tstamp>${implementation.build}; ${maven.build.timestamp}</implementation.build.tstamp>
     <maven.javadoc.plugin.version>2.8</maven.javadoc.plugin.version>
     <slf4j.logger.version>1.5.6</slf4j.logger.version>
     <sesame.version>2.6.1</sesame.version>
@@ -381,6 +382,12 @@
         <version>1.1.0</version>
       </dependency>
       <!-- END: Plugins specific dependencies -->
+
+      <dependency>
+        <groupId>com.beust</groupId>
+        <artifactId>jcommander</artifactId>
+        <version>1.23</version>
+      </dependency>
     </dependencies>
   </dependencyManagement>