You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by mo...@apache.org on 2012/04/03 11:40:04 UTC
svn commit: r1308786 [2/2] - in /incubator/any23/trunk: ./ core/
core/src/main/java/org/apache/any23/cli/
core/src/main/java/org/apache/any23/writer/ core/src/main/resources/
core/src/test/java/org/apache/any23/cli/ plugins/basic-crawler/
plugins/basic...
Modified: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java?rev=1308786&r1=1308785&r2=1308786&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java Tue Apr 3 09:40:03 2012
@@ -17,24 +17,27 @@
package org.apache.any23.cli;
+import com.beust.jcommander.IStringConverter;
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.ParameterException;
+import com.beust.jcommander.Parameters;
+import com.beust.jcommander.converters.FileConverter;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import org.apache.any23.plugin.crawler.CrawlerListener;
import org.apache.any23.plugin.crawler.SiteCrawler;
import org.apache.any23.source.StringDocumentSource;
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
import org.kohsuke.MetaInfServices;
import java.io.File;
-import java.io.IOException;
import java.net.URL;
import java.util.UUID;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
+import static java.lang.String.format;
+
/**
* Implementation of a <b>CLI crawler</b> based on
* {@link Rover}.
@@ -42,156 +45,118 @@ import java.util.regex.PatternSyntaxExce
* @author Michele Mostarda (mostarda@fbk.eu)
*/
@MetaInfServices( value = Tool.class )
-@ToolRunner.Description("Any23 Crawler Command Line Tool.")
+@Parameters(commandNames = "crawler", commandDescription = "Any23 Crawler Command Line Tool.")
public class Crawler extends Rover {
private final Object roverLock = new Object();
- public static void main(String[] args) {
- try {
- System.exit( new Crawler().run(args) );
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
+ @Parameter(
+ names = { "-pf", "--pagefilter" },
+ description = "Regex used to filter out page URLs during crawling.",
+ converter = PatterConverter.class
+ )
+ private Pattern pageFilter = Pattern.compile( SiteCrawler.DEFAULT_PAGE_FILTER_RE );
+
+ @Parameter(
+ names = { "-sf", "--storagefolder" },
+ description = "Folder used to store crawler temporary data.",
+ converter = FileConverter.class
+ )
+ private File storageFolder = new File(System.getProperty("java.io.tmpdir"), "crawler-metadata-" + UUID.randomUUID().toString());
+
+ @Parameter(names = { "-nc", "--numcrawlers" }, description = "Sets the number of crawlers.")
+ private int numCrawlers = SiteCrawler.DEFAULT_NUM_OF_CRAWLERS;
- @Override
- public int run(String[] args) {
- try {
- final String[] seeds = super.configure(args);
- if(seeds.length != 1) throw new IllegalArgumentException("Expected just one seed.");
- final URL seed = new URL(seeds[0]);
+ @Parameter(names = { "-mp", "--maxpages" }, description = "Max number of pages before interrupting crawl.")
+ private int maxPages = Integer.MAX_VALUE;
- final CommandLine commandLine = super.getCommandLine();
+ @Parameter(names = { "-md", "--maxdepth" }, description = "Max allowed crawler depth.")
+ private int maxDepth = Integer.MAX_VALUE;
- final SiteCrawler siteCrawler = new SiteCrawler( getStorageFolder(commandLine) );
+ @Parameter(names = { "-pd", "--politenessdelay" }, description = "Politeness delay in milliseconds.")
+ private int politenessDelay = Integer.MAX_VALUE;
- final Pattern specifiedPageFilter = getPageFilter(commandLine);
- final Pattern pageFilter = specifiedPageFilter == null ? siteCrawler.defaultFilters : specifiedPageFilter;
+ @Override
+ public void run() throws Exception {
+ super.configure();
- if(commandLine.hasOption("numcrawlers")) {
- siteCrawler.setNumOfCrawlers( parseInt(commandLine, "numcrawlers") );
- }
- if(commandLine.hasOption("maxpages")) {
- siteCrawler.setMaxPages(parseInt(commandLine, "maxpages"));
- }
- if(commandLine.hasOption("maxdepth")) {
- siteCrawler.setMaxDepth(parseInt(commandLine, "maxdepth"));
- }
- if (commandLine.hasOption("politenessdelay")) {
- final int politenessDelay = parseInt(commandLine, "politenessdelay");
- if(politenessDelay >= 0) siteCrawler.setPolitenessDelay(politenessDelay);
+ if (inputURIs.size() != 1) {
+ throw new IllegalArgumentException("Expected just one seed.");
+ }
+ final URL seed = new URL(inputURIs.get( 0 ));
+
+ if ( storageFolder.isFile() ) {
+ throw new IllegalStateException( format( "Storage folder %s can not be a file, must be a directory",
+ storageFolder ) );
+ }
+
+ if ( !storageFolder.exists() ) {
+ if ( !storageFolder.mkdirs() ) {
+ throw new IllegalStateException(
+ format( "Storage folder %s can not be created, please verify you have enough permissions",
+ storageFolder ) );
}
+ }
+
+ final SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
+ siteCrawler.setNumOfCrawlers( numCrawlers );
+ siteCrawler.setMaxPages( maxPages );
+ siteCrawler.setMaxDepth( maxDepth );
+ siteCrawler.setPolitenessDelay(politenessDelay);
+
+ siteCrawler.addListener(new CrawlerListener() {
+ @Override
+ public void visitedPage(Page page) {
+ final String pageURL = page.getWebURL().getURL();
+ System.err.println( format("Processing page: [%s]", pageURL) );
+
+ final ParseData parseData = page.getParseData();
+ if (parseData instanceof HtmlParseData) {
+ final HtmlParseData htmlParseData = (HtmlParseData) parseData;
+ try {
+ synchronized (roverLock) {
+ Crawler.super.performExtraction(
+ new StringDocumentSource(
+ htmlParseData.getHtml(),
+ pageURL
- siteCrawler.addListener(new CrawlerListener() {
- @Override
- public void visitedPage(Page page) {
- final String pageURL = page.getWebURL().getURL();
- System.err.println( String.format("Processing page: [%s]", pageURL) );
-
- final ParseData parseData = page.getParseData();
- if (parseData instanceof HtmlParseData) {
- final HtmlParseData htmlParseData = (HtmlParseData) parseData;
- try {
- synchronized (roverLock) {
- Crawler.super.performExtraction(
- new StringDocumentSource(
- htmlParseData.getHtml(),
- pageURL
-
- )
- );
- }
- } catch (Exception e) {
- System.err.println(
- String.format("Error while processing page [%s], error: %s .", pageURL, e.getMessage())
+ )
);
}
- }
- }
- });
-
- Runtime.getRuntime().addShutdownHook( new Thread() {
- @Override
- public void run() {
- try {
- System.err.println( Crawler.super.printReports() );
- // siteCrawler.stop(); // TODO: cause shutdown hanging.
} catch (Exception e) {
- e.printStackTrace();
+ System.err.println(format("Error while processing page [%s], error: %s .",
+ pageURL, e.getMessage())
+ );
}
}
- });
- siteCrawler.start(seed, pageFilter, true);
- return 0;
- } catch (Exception e) {
- if(super.isVerbose()) e.printStackTrace();
- if(e instanceof ExitCodeException) {
- return ((ExitCodeException) e).getExitCode();
}
- return 1;
- }
- }
+ });
- @Override
- protected Options createOptions() {
- final Options roverOptions = super.createOptions();
- addCrawlerOptions(roverOptions);
- return roverOptions;
+ Runtime.getRuntime().addShutdownHook( new Thread() {
+ @Override
+ public void run() {
+ try {
+ System.err.println( Crawler.super.printReports() );
+ // siteCrawler.stop(); // TODO: cause shutdown hanging.
+ } catch (Exception e) {
+ e.printStackTrace(System.err);
+ }
+ }
+ });
+ siteCrawler.start(seed, pageFilter, true);
}
- private void addCrawlerOptions(Options options) {
- options.addOption(
- new Option("pagefilter" , true, "Regex used to filter out page URLs during crawling. Default: '" + SiteCrawler.DEFAULT_PAGE_FILTER_RE + "'")
- );
- options.addOption(
- new Option("storagefolder" , true, "Folder used to store crawler temporary data. Default: [" + System.getProperty("java.io.tmpdir") + "]")
- );
- options.addOption(
- new Option("numcrawlers" , true, "Sets the number of crawlers. Default: " + SiteCrawler.DEFAULT_NUM_OF_CRAWLERS)
- );
- options.addOption(
- new Option("maxpages" , true, "Max number of pages before interrupting crawl. Default: no limit.")
- );
- options.addOption(
- new Option("maxdepth" , true, "Max allowed crawler depth. Default: no limit.")
- );
- options.addOption(
- new Option("politenessdelay", true, "Politeness delay in milliseconds. Default: no limit.")
- );
- }
+ public static final class PatterConverter implements IStringConverter<Pattern> {
- private Pattern getPageFilter(CommandLine commandLine) {
- if(commandLine.hasOption("pagefilter")) {
+ @Override
+ public Pattern convert( String value ) {
try {
- return Pattern.compile( commandLine.getOptionValue("pagefilter") );
+ return Pattern.compile( value );
} catch (PatternSyntaxException pse) {
- throw new ExitCodeException("Invalid page filter, must be a regular expression.", 6);
+ throw new ParameterException( format("Invalid page filter, '%s' must be a regular expression.", value) );
}
}
- return null;
- }
- private File getStorageFolder(CommandLine commandLine) throws IOException {
- if(commandLine.hasOption("storagefolder")) {
- final File candidate = new File( commandLine.getOptionValue("storagefolder") );
- if(candidate.exists() && candidate.isFile())
- throw new IllegalArgumentException("The storage folder must be a directory.");
- return candidate;
- } else {
- final File tmpDir = File.createTempFile("crawler-metadata-" + UUID.randomUUID().toString(), "db");
- tmpDir.delete();
- return tmpDir;
- }
- }
-
- private int parseInt(CommandLine cl, String option) {
- final String value = cl.getOptionValue(option);
- try {
- return Integer.parseInt(value);
- } catch (NumberFormatException nfe) {
- throw new IllegalArgumentException(String.format("Expected integer for %s found '%s' .", option, value));
- }
}
}
Modified: incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java?rev=1308786&r1=1308785&r2=1308786&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java Tue Apr 3 09:40:03 2012
@@ -17,8 +17,6 @@
package org.apache.any23.cli;
-import static org.junit.Assert.*;
-
import org.apache.any23.Any23OnlineTestBase;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.util.FileUtils;
@@ -36,6 +34,8 @@ import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
+import static org.junit.Assert.assertTrue;
+
/**
* Test case for {@link Crawler} CLI.
*
@@ -57,13 +57,17 @@ public class CrawlerTest extends Any23On
new Runnable() {
@Override
public void run() {
- Crawler.main(
- String.format(
- "-f nquads -maxpages 50 -maxdepth 1 -politenessdelay 500 -o %s " +
- "http://eventiesagre.it/",
- outFile.getAbsolutePath()
- ).split(" ")
- );
+ try {
+ ToolRunner.main(
+ String.format(
+ "crawler -f nquads --maxpages 50 --maxdepth 1 --politenessdelay 500 -o %s " +
+ "http://eventiesagre.it/",
+ outFile.getAbsolutePath()
+ ).split(" ")
+ );
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
}
}
);
@@ -80,7 +84,7 @@ public class CrawlerTest extends Any23On
final String[] lines = FileUtils.readFileLines(outFile);
final StringBuilder allLinesExceptLast = new StringBuilder();
- for(int i = 0; i < lines.length - 1; i++) {
+ for (int i = 0; i < lines.length - 1; i++) {
allLinesExceptLast.append(lines[i]);
}
Modified: incubator/any23/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/pom.xml?rev=1308786&r1=1308785&r2=1308786&view=diff
==============================================================================
--- incubator/any23/trunk/pom.xml (original)
+++ incubator/any23/trunk/pom.xml Tue Apr 3 09:40:03 2012
@@ -209,6 +209,7 @@
<javac.target.version>1.6</javac.target.version>
<maven.build.timestamp.format>yyyy-MM-dd HH:mm:ssZ</maven.build.timestamp.format>
<implementation.build>${scmBranch}@r${buildNumber}</implementation.build>
+ <implementation.build.tstamp>${implementation.build}; ${maven.build.timestamp}</implementation.build.tstamp>
<maven.javadoc.plugin.version>2.8</maven.javadoc.plugin.version>
<slf4j.logger.version>1.5.6</slf4j.logger.version>
<sesame.version>2.6.1</sesame.version>
@@ -381,6 +382,12 @@
<version>1.1.0</version>
</dependency>
<!-- END: Plugins specific dependencies -->
+
+ <dependency>
+ <groupId>com.beust</groupId>
+ <artifactId>jcommander</artifactId>
+ <version>1.23</version>
+ </dependency>
</dependencies>
</dependencyManagement>