You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by mo...@apache.org on 2012/01/10 17:32:33 UTC

svn commit: r1229627 [5/5] - in /incubator/any23/trunk: ./ any23-core/ any23-core/bin/ any23-core/src/main/java/org/deri/any23/ any23-core/src/main/java/org/deri/any23/cli/ any23-core/src/main/java/org/deri/any23/eval/ any23-core/src/main/java/org/deri...

Added: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/cli/Crawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/cli/Crawler.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/cli/Crawler.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/cli/Crawler.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,187 @@
+/*
+ * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *          http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.deri.any23.cli;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.deri.any23.plugin.crawler.CrawlerListener;
+import org.deri.any23.plugin.crawler.SiteCrawler;
+import org.deri.any23.source.StringDocumentSource;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.util.UUID;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Implementation of a <b>CLI crawler</b> based on
+ * {@link Rover}.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+@ToolRunner.Description("Any23 Crawler Command Line Tool.")
+public class Crawler extends Rover {
+
+    private final Object roverLock = new Object();
+
+    public static void main(String[] args) {
+        try {
+            System.exit( new Crawler().run(args) );
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    @Override
+    public int run(String[] args) {
+        try {
+            final String[] seeds = super.configure(args);
+            if(seeds.length != 1) throw new IllegalArgumentException("Expected just one seed.");
+            final URL seed = new URL(seeds[0]);
+
+            final CommandLine commandLine = super.getCommandLine();
+
+            final SiteCrawler siteCrawler = new SiteCrawler( getStorageFolder(commandLine) );
+
+            final Pattern specifiedPageFilter = getPageFilter(commandLine);
+            final Pattern pageFilter = specifiedPageFilter == null ? siteCrawler.defaultFilters : specifiedPageFilter;
+
+            if(commandLine.hasOption("numcrawlers")) {
+                siteCrawler.setNumOfCrawlers( parseInt(commandLine, "numcrawlers") );
+            }
+            if(commandLine.hasOption("maxpages")) {
+                siteCrawler.setMaxPages(parseInt(commandLine, "maxpages"));
+            }
+            if(commandLine.hasOption("maxdepth")) {
+                siteCrawler.setMaxDepth(parseInt(commandLine, "maxdepth"));
+            }
+            if (commandLine.hasOption("politenessdelay")) {
+                final int politenessDelay = parseInt(commandLine, "politenessdelay");
+                if(politenessDelay >= 0) siteCrawler.setPolitenessDelay(politenessDelay);
+            }
+
+            siteCrawler.addListener(new CrawlerListener() {
+                @Override
+                public void visitedPage(Page page) {
+                    final String pageURL = page.getWebURL().getURL();
+                    System.err.println( String.format("Processing page: [%s]", pageURL) );
+                    try {
+                        synchronized (roverLock) {
+                            Crawler.super.performExtraction(
+                                    new StringDocumentSource(
+                                            page.getHTML(),
+                                            pageURL
+
+                                    )
+                            );
+                        }
+                    } catch (Exception e) {
+                        System.err.println(
+                                String.format("Error while processing page [%s], error: %s .", pageURL, e.getMessage())
+                        );
+                    }
+                }
+            });
+
+            Runtime.getRuntime().addShutdownHook( new Thread() {
+                @Override
+                public void run() {
+                    try {
+                        System.err.println( Crawler.super.printReports() );
+                        // siteCrawler.stop(); // TODO: cause shutdown hanging.
+                    } catch (Exception e) {
+                        e.printStackTrace();
+                    }
+                }
+            });
+            siteCrawler.start(seed, pageFilter, true);
+            return 0;
+        } catch (Exception e) {
+            if(super.isVerbose()) e.printStackTrace();
+            if(e instanceof ExitCodeException) {
+                return ((ExitCodeException) e).getExitCode();
+            }
+            return 1;
+        }
+    }
+
+    @Override
+    protected Options createOptions() {
+        final Options roverOptions = super.createOptions();
+        addCrawlerOptions(roverOptions);
+        return roverOptions;
+    }
+
+    private void addCrawlerOptions(Options options) {
+        options.addOption(
+                new Option("pagefilter"     , true, "Regex used to filter out page URLs during crawling. Default: '" + SiteCrawler.DEFAULT_PAGE_FILTER_RE + "'")
+        );
+        options.addOption(
+                new Option("storagefolder"  , true, "Folder used to store crawler temporary data. Default: [" + System.getProperty("java.io.tmpdir")  + "]")
+        );
+        options.addOption(
+                new Option("numcrawlers"    , true, "Sets the number of crawlers. Default: " + SiteCrawler.DEFAULT_NUM_OF_CRAWLERS)
+        );
+        options.addOption(
+                new Option("maxpages"       , true, "Max number of pages before interrupting crawl. Default: no limit.")
+        );
+        options.addOption(
+                new Option("maxdepth"       , true, "Max allowed crawler depth. Default: no limit.")
+        );
+        options.addOption(
+                new Option("politenessdelay", true, "Politeness delay in milliseconds. Default: no limit.")
+        );
+    }
+
+    private Pattern getPageFilter(CommandLine commandLine) {
+        if(commandLine.hasOption("pagefilter")) {
+            try {
+                return Pattern.compile( commandLine.getOptionValue("pagefilter") );
+            } catch (PatternSyntaxException pse) {
+                throw new ExitCodeException("Invalid page filter, must be a regular expression.", 6);
+            }
+        }
+        return null;
+    }
+
+    private File getStorageFolder(CommandLine commandLine) throws IOException {
+        if(commandLine.hasOption("storagefolder")) {
+           final File candidate = new  File( commandLine.getOptionValue("storagefolder") );
+           if(candidate.exists() && candidate.isFile())
+               throw new IllegalArgumentException("The storage folder must be a directory.");
+            return candidate;
+        } else {
+            final File tmpDir = File.createTempFile("crawler-metadata-" + UUID.randomUUID().toString(), "db");
+            tmpDir.delete();
+            return tmpDir;
+        }
+    }
+
+    private int parseInt(CommandLine cl, String option) {
+        final String value = cl.getOptionValue(option);
+        try {
+            return Integer.parseInt(value);
+        } catch (NumberFormatException nfe) {
+            throw new IllegalArgumentException(String.format("Expected integer for %s found '%s' .", option, value));
+        }
+    }
+
+}

Added: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/CrawlerListener.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/CrawlerListener.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/CrawlerListener.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/CrawlerListener.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *          http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.deri.any23.plugin.crawler;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+
+/**
+ * Defines a listener for a {@link SiteCrawler}:
+ * implementations are notified of the pages processed by the crawler.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public interface CrawlerListener {
+
+    /**
+     * Notifies the listener that a page has been visited.
+     *
+     * @param page the visited page data.
+     */
+    void visitedPage(Page page);
+
+}

Added: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/DefaultWebCrawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/DefaultWebCrawler.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/DefaultWebCrawler.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/DefaultWebCrawler.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *          http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.deri.any23.plugin.crawler;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.crawler.WebCrawler;
+import edu.uci.ics.crawler4j.url.WebURL;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.regex.Pattern;
+
+/**
+ * Default {@link WebCrawler} implementation: it visits only the URLs
+ * located under the crawl seed and not excluded by the page filter,
+ * and notifies every visited page through the {@link SharedData}.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class DefaultWebCrawler extends WebCrawler {
+
+    private static final Logger logger = LoggerFactory.getLogger(DefaultWebCrawler.class);
+
+    /**
+     * Shared data reference.
+     */
+    private final SharedData sharedData = SharedData.getInstance();
+
+    /**
+     * Page filter pattern, can be <code>null</code>.
+     */
+    private final Pattern pattern = sharedData.getPattern();
+
+    /**
+     * Decides whether the given URL should be visited or not.
+     * A URL is visited if it starts with the crawl seed (ignoring the case)
+     * and its lower case form is <b>not</b> fully matched by the page filter.
+     *
+     * @param url the candidate URL.
+     * @return <code>true</code> if the URL must be visited, <code>false</code> otherwise.
+     */
+    @Override
+    public boolean shouldVisit(WebURL url) {
+        final String rawURL = url.getURL();
+        if (rawURL == null) return false;
+        // The seed is compared ignoring the case: comparing the lower cased URL against
+        // the unchanged seed would reject every page when the seed contains upper case chars.
+        final String seed = sharedData.getSeed();
+        if( ! rawURL.regionMatches(true, 0, seed, 0, seed.length()) ) return false;
+        final String href = rawURL.toLowerCase();
+        return pattern == null || ! pattern.matcher(href).matches();
+    }
+
+    /**
+     * Processes a single page by notifying it to the registered listeners.
+     *
+     * @param page the visited page.
+     */
+    @Override
+    public void visit(Page page) {
+        logger.trace("Visiting page: {}", page.getWebURL().getURL());
+        sharedData.notifyPage(page);
+    }
+
+}
+

Added: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/SharedData.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/SharedData.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/SharedData.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/SharedData.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *          http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.deri.any23.plugin.crawler;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * This class hosts shared data structures accessible
+ * to all the {@link DefaultWebCrawler} instances
+ * run by the {@link SiteCrawler}.
+ * The data is held by a single instance replaced at every
+ * {@link #setCrawlData(String, java.util.regex.Pattern, java.util.List)} invocation.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class SharedData {
+
+    /**
+     * Singleton instance.
+     */
+    private static SharedData instance;
+
+    /**
+     * Crawl seed.
+     */
+    private final String seed;
+
+    /**
+     * Crawl page filter pattern.
+     */
+    private final Pattern pattern;
+
+    /**
+     * List of crawler listeners.
+     */
+    private final List<CrawlerListener> listeners;
+
+    /**
+     * @return the singleton instance.
+     * @throws IllegalStateException if {@link #setCrawlData(String, java.util.regex.Pattern, java.util.List)}
+     *         has not been invoked yet.
+     */
+    protected static SharedData getInstance() {
+        if(instance == null) throw new IllegalStateException("The configuration has not yet initialized.");
+        return instance;
+    }
+
+    /**
+     * Initializes the crawler data, replacing any previous one.
+     *
+     * @param seed crawler seed.
+     * @param regex page filter regex, can be <code>null</code>.
+     * @param listeners the listeners to be notified of the crawler activity.
+     * @throws IllegalArgumentException if the seed is <code>null</code> or blank,
+     *         or if the listeners list is <code>null</code>.
+     */
+    protected static void setCrawlData(String seed, Pattern regex, List<CrawlerListener> listeners) {
+        instance = new SharedData(seed, regex, listeners);
+    }
+
+    /**
+     * Internal constructor.
+     *
+     * @param seed the crawl seed, must be not <code>null</code> and not blank.
+     * @param pattern the page filter pattern, can be <code>null</code>.
+     * @param listeners the listeners to be notified, must be not <code>null</code>.
+     */
+    private SharedData(String seed, Pattern pattern, List<CrawlerListener> listeners) {
+        if(seed == null || seed.trim().length() == 0)
+            throw new IllegalArgumentException(
+                String.format("Invalid seed '%s'", seed)
+            );
+        // Fail at configuration time instead of at the first page notification.
+        if(listeners == null)
+            throw new IllegalArgumentException("Invalid listeners list: null");
+
+        this.seed      = seed;
+        this.pattern   = pattern;
+        this.listeners = listeners;
+    }
+
+    /**
+     * @return crawl seed.
+     */
+    protected String getSeed() {
+        return seed;
+    }
+
+    /**
+     * @return page filter pattern, can be <code>null</code>.
+     */
+    protected Pattern getPattern() {
+        return pattern;
+    }
+
+    /**
+     * Notifies all listeners that a page has been visited.
+     * The listeners are notified over a snapshot of the list, so that a listener
+     * registered or removed while a notification is in progress does not break the iteration.
+     *
+     * @param page the visited page.
+     */
+    protected void notifyPage(Page page) {
+        // NOTE(review): the snapshot is taken atomically only if the given list
+        // provides a thread safe toArray (e.g. a synchronized list wrapper).
+        final CrawlerListener[] snapshot = listeners.toArray( new CrawlerListener[0] );
+        for(CrawlerListener listener : snapshot) {
+            listener.visitedPage(page);
+        }
+    }
+
+}

Added: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/SiteCrawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/SiteCrawler.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/SiteCrawler.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/SiteCrawler.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,272 @@
+/*
+ * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *          http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.deri.any23.plugin.crawler;
+
+import edu.uci.ics.crawler4j.crawler.CrawlController;
+import edu.uci.ics.crawler4j.crawler.WebCrawler;
+
+import java.io.File;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.regex.Pattern;
+
+/**
+ * A basic <em>site crawler</em> to extract semantic content
+ * of small/medium size sites.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class SiteCrawler {
+
+    public static final String DEFAULT_PAGE_FILTER_RE =
+        ".*(\\.(" +
+                    "css|js"                            +
+                    "|bmp|gif|jpe?g|png|tiff?"          +
+                    "|mid|mp2|mp3|mp4|wav|wma"          +
+                    "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
+                    "|pdf"        +
+                    "|swf"        +
+                    "|zip|rar|gz" +
+                    "|xml|txt"    +
+        "))$";
+
+    /**
+     * Default number of crawler instances.
+     */
+    public static final int DEFAULT_NUM_OF_CRAWLERS = 10;
+
+    /**
+     * Default crawler implementation.
+     */
+    public static final Class<? extends WebCrawler> DEFAULT_WEB_CRAWLER = DefaultWebCrawler.class;
+
+    /**
+     * Default filter applied to skip contents.
+     */
+    public final Pattern defaultFilters = Pattern.compile(DEFAULT_PAGE_FILTER_RE);
+
+    /**
+     * The crawler threads controller.
+     */
+    private final CrawlController controller;
+
+    /**
+     * Crawler listeners.
+     */
+    private final List<CrawlerListener> listeners = new ArrayList<CrawlerListener>();
+
+    /**
+     * Actual number of crawler instances.
+     */
+    private int numOfCrawlers = DEFAULT_NUM_OF_CRAWLERS;
+
+    /**
+     * Actual web crawler.
+     */
+    private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER;
+
+    /**
+     * Max allowed depth, <code>-1</code> means no limit.
+     */
+    private int maxDepth = -1;
+
+    /**
+     *  Max allowed pages, <code>-1</code> means no limit.
+     */
+    private int maxPages = -1;
+
+    /**
+     * Subsequent call politeness delay, <code>-1</code> means no limit.
+     */
+    private int politenessDelay = -1;
+
+    /**
+     * Internal executor service.
+     */
+    private ExecutorService service;
+
+    /**
+     * Constructor.
+     *
+     * @param storageFolder location used to store the temporary data structures used by the crawler.
+     */
+    public SiteCrawler(File storageFolder) {
+        try {
+            controller = new CrawlController( storageFolder.getAbsolutePath() );
+        } catch (Exception e) {
+            throw new IllegalArgumentException("Error while initializing crawler controller.", e);
+        }
+    }
+
+    /**
+     * @return number of crawler instances.
+     */
+    public int getNumOfCrawlers() {
+        return numOfCrawlers;
+    }
+
+    /**
+     * Sets the number of crawler instances.
+     *
+     * @param n an integer &gt;= 0.
+     */
+    public void setNumOfCrawlers(int n) {
+        if(n <=0) throw new IllegalArgumentException("Invalid number of crawlers, must be > 0 .");
+        this.numOfCrawlers = n;
+    }
+
+    public Class<? extends WebCrawler> getWebCrawler() {
+        return webCrawler;
+    }
+
+    /**
+     * Sets the actual crawler clas.
+     *
+     * @param c a not <code>class</code>.
+     */
+    public void setWebCrawler(Class<? extends WebCrawler> c) {
+        if(c == null) throw new NullPointerException("c cannot be null.");
+        this.webCrawler = c;
+    }
+
+    /**
+     * @return the max allowed crawl depth, <code>-1</code> means no limit.
+     */
+    public int getMaxDepth() {
+        return maxDepth;
+    }
+
+    /**
+     * Sets the maximum depth.
+     *
+     * @param maxDepth maximum allowed depth. <code>-1</code> means no limit.
+     */
+    public void setMaxDepth(int maxDepth) {
+        if(maxDepth < -1 || maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0");
+        if(maxDepth > 0) try {
+            controller.setMaximumCrawlDepth(maxDepth);
+        } catch (Exception e) {
+            throw new IllegalArgumentException("Error while setting maxDepth.", e);
+        }
+        this.maxDepth = maxDepth;
+    }
+
+    /**
+     * @return max number of allowed pages.
+     */
+    public int getMaxPages() {
+        return maxPages;
+    }
+
+    /**
+     * Sets the maximum collected pages.
+     *
+     * @param maxPages maximum allowed pages. <code>-1</code> means no limit.
+     */
+    public void setMaxPages(int maxPages) {
+        if(maxPages < -1 || maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
+        if(maxPages > 0) controller.setMaximumPagesToFetch(maxPages);
+        this.maxPages = maxPages;
+    }
+
+    /**
+     * @return the politeness delay in milliseconds.
+     */
+    public int getPolitenessDelay() {
+        return politenessDelay;
+    }
+
+    /**
+     * Sets the politeness delay. <code>-1</code> means no politeness.
+     *
+     * @param millis delay in milliseconds.
+     */
+    public void setPolitenessDelay(int millis) {
+        if(millis < -1) throw new IllegalArgumentException("Invalid politenessDelay, must be >= -1");
+        if(millis >= 0) controller.setPolitenessDelay(millis);
+        this.politenessDelay = millis;
+    }
+
+    /**
+     * Registers a {@link CrawlerListener} to this crawler.
+     *
+     * @param listener
+     */
+    public void addListener(CrawlerListener listener) {
+        listeners.add(listener);
+    }
+
+    /**
+     * Deregisters a {@link CrawlerListener} from this crawler.
+     *
+     * @param listener
+     */
+    public void removeListener(CrawlerListener listener) {
+        listeners.remove(listener);
+    }
+
+    /**
+     * Starts the crawling process.
+     *
+     * @param seed the starting URL for the crawler process.
+     * @param filters filters to be applied to the crawler process. Can be <code>null</code>.
+     * @param wait if <code>true</code> the process will wait for the crawler termination.
+     * @throws Exception
+     */
+    public synchronized void start(
+            final URL seed, final Pattern filters, final boolean wait
+    ) throws Exception {
+        SharedData.setCrawlData(seed.toExternalForm(), filters, Collections.synchronizedList(listeners) );
+        controller.addSeed(seed.toExternalForm());
+        final Runnable internalRunnable = new Runnable() {
+            @Override
+            public void run() {
+                controller.start(getWebCrawler(), getNumOfCrawlers());
+            }
+        };
+        if(wait) {
+            internalRunnable.run();
+        } else {
+            if(service != null) throw new IllegalStateException("Another service seems to run.");
+            service = Executors.newSingleThreadExecutor();
+            service.execute(internalRunnable);
+        }
+    }
+
+    /**
+     * Starts the crawler process with the {@link #defaultFilters}.
+     *
+     * @param seed the starting URL for the crawler process.
+     * @param wait if <code>true</code> the process will wait for the crawler termination.
+     * @throws Exception
+     */
+    public void start(final URL seed, final boolean wait) throws Exception {
+        start(seed, defaultFilters, wait);
+    }
+
+    /**
+     * Interrupts the crawler process if started with <code>wait</code> flag == <code>false</code>.
+     */
+    public synchronized void stop() {
+        service.shutdownNow();
+    }
+
+}

Added: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/package-info.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/package-info.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/package-info.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/deri/any23/plugin/crawler/package-info.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,4 @@
+/**
+ * This package defines a handy self contained Web Crawler.
+ */
+package org.deri.any23.plugin.crawler;
\ No newline at end of file

Added: incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/Any23OnlineTestBase.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/Any23OnlineTestBase.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/Any23OnlineTestBase.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/Any23OnlineTestBase.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2004-2005 The Apache Software Foundation.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+
+package org.deri.any23;
+
+import org.junit.Assume;
+
+/**
+ * Common parent of the <code>Any23</code> test classes declaring online tests,
+ * i.e. tests needing online resources to run.
+ * Online tests are skipped when the JVM property {@link #ONLINE_TEST_DISABLED_FLAG} is defined.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+// TODO: this class has been duplicated from any23-core test classpath.
+public abstract class Any23OnlineTestBase {
+
+    /**
+     * Name of the JVM property disabling the online tests when defined.
+     */
+    public static final String ONLINE_TEST_DISABLED_FLAG = "any23.online.test.disabled";
+
+    /**
+     * Skips the calling test, through a JUnit assumption,
+     * unless the online tests are allowed.
+     */
+    public static void assumeOnlineAllowed() {
+        final String disabledFlag = System.getProperty(ONLINE_TEST_DISABLED_FLAG);
+        final boolean onlineAllowed = (disabledFlag == null);
+        Assume.assumeTrue(onlineAllowed);
+    }
+
+}

Added: incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/cli/CrawlerTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/cli/CrawlerTest.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/cli/CrawlerTest.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/cli/CrawlerTest.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *          http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.deri.any23.cli;
+
+import org.deri.any23.Any23OnlineTestBase;
+import org.deri.any23.rdf.RDFUtils;
+import org.deri.any23.util.FileUtils;
+import org.junit.Assert;
+import org.junit.Test;
+import org.openrdf.model.Statement;
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.rio.RDFParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Test case for {@link Crawler} CLI.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class CrawlerTest extends Any23OnlineTestBase {
+
+    public static final Logger logger = LoggerFactory.getLogger(CrawlerTest.class);
+
+    /**
+     * Runs the crawler CLI for a limited time against an online site and verifies
+     * that the produced output contains at least one valid <i>N-Quads</i> statement.
+     *
+     * @throws IOException if the output file cannot be created or read.
+     * @throws RDFHandlerException if the produced statements cannot be handled.
+     * @throws RDFParseException if the produced output is not valid.
+     */
+    @Test
+    public void testCLI() throws IOException, RDFHandlerException, RDFParseException {
+        assumeOnlineAllowed();
+
+        final File outFile = File.createTempFile("crawler-test", ".nq");
+        outFile.delete();
+        logger.debug( "Outfile: " + outFile.getAbsolutePath() );
+
+        // The arguments are listed one by one: splitting a formatted command line
+        // on spaces would break an output path containing spaces.
+        final String[] args = {
+                "-f", "nquads",
+                "-maxpages", "50",
+                "-maxdepth", "1",
+                "-politenessdelay", "500",
+                "-o", outFile.getAbsolutePath(),
+                "http://eventiesagre.it/"
+        };
+
+        final Future<?> future = Executors.newSingleThreadExecutor().submit(
+            new Runnable() {
+                @Override
+                public void run() {
+                    // Crawler.main() is deliberately avoided: it ends with System.exit(),
+                    // which would terminate the test JVM if the crawl completes in time.
+                    new Crawler().run(args);
+                }
+            }
+        );
+
+        try {
+            future.get(10, TimeUnit.SECONDS);
+        } catch (Exception e) {
+            // OK: the crawl is expected to be still running when the timeout expires.
+        }
+        Assert.assertTrue("The output file has not been created.", outFile.exists());
+
+        // The last line is skipped since it could have been written only partially.
+        final String[] lines = FileUtils.readFileLines(outFile);
+        final StringBuilder allLinesExceptLast = new StringBuilder();
+        for(int i = 0; i < lines.length - 1; i++) {
+            // N-Quads is a line based format: one statement per line.
+            allLinesExceptLast.append(lines[i]).append('\n');
+        }
+
+        final Statement[] statements = RDFUtils.parseRDF(RDFUtils.Parser.NQuads, allLinesExceptLast.toString());
+        Assert.assertTrue(statements.length > 0);
+    }
+
+}

Added: incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/plugin/crawler/SiteCrawlerTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/plugin/crawler/SiteCrawlerTest.java?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/plugin/crawler/SiteCrawlerTest.java (added)
+++ incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/deri/any23/plugin/crawler/SiteCrawlerTest.java Tue Jan 10 16:32:28 2012
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *          http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.deri.any23.plugin.crawler;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+import org.deri.any23.Any23OnlineTestBase;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Online test case for {@link SiteCrawler}: runs a short crawl against a live site and checks pages are visited.
+ *
+ * @author Michele Mostarda (mostarda@fbk.eu)
+ */
+public class SiteCrawlerTest extends Any23OnlineTestBase {
+
+    public static final Logger logger = LoggerFactory.getLogger(SiteCrawlerTest.class);
+
+    /**
+     * Crawls http://schema.org/ (max 100 pages, politeness delay 500, presumably ms), waits up to 15 seconds,
+     * stops the crawler and asserts that the registered {@link CrawlerListener} saw at least one distinct URL.
+     * @throws Exception if any step of the setup, the crawl or the wait fails.
+     */
+    @Test
+    public void testSiteCrawling() throws Exception {
+        assumeOnlineAllowed();
+
+        File tmpFile = File.createTempFile("site-crawler-test", ".storage");
+        tmpFile.delete();
+
+        final SiteCrawler controller = new SiteCrawler(tmpFile);
+        controller.setMaxPages(100);
+        controller.setPolitenessDelay(500);
+
+        final Set<String> distinctPages = new HashSet<String>();
+        controller.addListener(new CrawlerListener() {
+            @Override
+            public void visitedPage(Page page) {
+                distinctPages.add( page.getWebURL().getURL() );
+            }
+        });
+
+        controller.start( new URL("http://schema.org/"), false);
+
+        synchronized (this) {
+            this.wait(15 * 1000);
+        }
+        controller.stop();
+
+        logger.debug("Crawled pages: " + distinctPages.size());
+        Assert.assertTrue("Expected some page crawled.", distinctPages.size() > 0);
+    }
+
+}

Modified: incubator/any23/trunk/plugins/integration-test/src/test/java/org/deri/any23/plugin/PluginIT.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/integration-test/src/test/java/org/deri/any23/plugin/PluginIT.java?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/integration-test/src/test/java/org/deri/any23/plugin/PluginIT.java (original)
+++ incubator/any23/trunk/plugins/integration-test/src/test/java/org/deri/any23/plugin/PluginIT.java Tue Jan 10 16:32:28 2012
@@ -32,7 +32,7 @@ import java.io.IOException;
  */
 public class PluginIT {
 
-    private static final int NUM_OF_EXTRACTORS = 22;
+    private static final int NUM_OF_EXTRACTORS = 23;
 
     private static final String PLUGIN_LOCATION = "target/plugins-build/";
 

Modified: incubator/any23/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/pom.xml?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/pom.xml (original)
+++ incubator/any23/trunk/pom.xml Tue Jan 10 16:32:28 2012
@@ -86,7 +86,7 @@
         <compiler.version>1.6</compiler.version>
         <maven.javadoc.plugin.version>2.8</maven.javadoc.plugin.version>
         <slf4j.logger.version>1.5.6</slf4j.logger.version>
-        <sesame.version>2.4.0</sesame.version>
+        <sesame.version>2.6.1</sesame.version>
     </properties>
 
     <!-- Project repository configuration. -->
@@ -280,6 +280,11 @@
             </dependency>
             <dependency>
                 <groupId>org.openrdf.sesame</groupId>
+                <artifactId>sesame-rio-trix</artifactId>
+                <version>${sesame.version}</version>
+            </dependency>
+            <dependency>
+                <groupId>org.openrdf.sesame</groupId>
                 <artifactId>sesame-repository-sail</artifactId>
                 <version>${sesame.version}</version>
             </dependency>
@@ -670,6 +675,7 @@
 
     <modules>
         <module>any23-core</module>
+        <module>plugins/basic-crawler</module>
         <module>plugins/html-scraper</module>
         <module>plugins/office-scraper</module>
         <module>plugins/integration-test</module>

Modified: incubator/any23/trunk/src/site/apt/any23-plugins.apt
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/src/site/apt/any23-plugins.apt?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/src/site/apt/any23-plugins.apt (original)
+++ incubator/any23/trunk/src/site/apt/any23-plugins.apt Tue Jan 10 16:32:28 2012
@@ -2,38 +2,57 @@ Any23 Plugins
 
 * Introduction
 
-    This section describes the <Any23> initial support for plugins.
+    This section describes the <Any23> plugins support.
 
-    <Any23> cames with a set of predefined plugins. Plugins are located
-    under <any23-root>/plugins dir.
+    <Any23> comes with a set of predefined plugins.
+    Such plugins are located under the <any23-root>/<<plugins>> dir.
 
-    Every plugin is a standard Maven2 module implementing at least the <ExtractorPlugin> interface.
+    A plugin is a standard <Maven3> module containing any implementation of
 
-    Currently it is possible to add only new <Extractor>s.
+    * {{{./xref/org/deri/any23/plugin/ExtractorPlugin.html}ExtractorPlugin}}
+
+    * {{{./xref/org/deri/any23/cli/Tool.html}Tool}}
+
+* How to Register a Plugin
+
+   A plugin can be added to <Any23> by:
+
+   * adding its <JAR> to the <Any23> <JVM classpath>;
+
+   * adding its <JAR> to the <$HOME/.any23/plugins> directory.
+
+   TODO: plugin support in CLI
+
+   TODO: plugin support in library
+
+   TODO: plugin support in Any23 Service
+
+    Any implementation of <ExtractorPlugin> will be automatically registered with the
+    {{{./xref/org/deri/any23/extractor/ExtractorRegistry.html}ExtractorRegistry}}.
+
+    Any detected implementation of <Tool> will be listed by the <ToolRunner>
+    command-line tool in <any23-root/><<bin/any23tools>> .
 
 * How to Build a Plugin
 
-   <Any23> takes care to build and test plugins when distributed from its reactor pom.
+   <Any23> takes care to <test> and <package> plugins when distributed from its reactor <POM>.
    It is always possible to rebuild a plugin using the command:
 
-+--------------------------------------
-  <plugin-dir>$ mvn clean compile
-+--------------------------------------
++------------------------------------------
+  <plugin-dir>$ mvn clean assembly:assembly
++------------------------------------------
 
-* How to Add a Plugin
+* How to Write an Extractor Plugin
 
-   A plugin can be added to <Any23> simply adding its JAR to the Any23 classpath.
-   <Any23> will auto detect the plugin and will register it to the extractors list.
+   An <Extractor Plugin> is a class:
 
-* How to Write a Plugin
+   * implementing the {{{./xref/org/deri/any23/plugin/ExtractorPlugin.html}ExtractorPlugin}} interface;
 
-   Currently only Extractors can be defined as plugin. To declare a new plugin it
-   is needed to implement the {{{ExtractorPlugin}}} interface and to annotate the
-   implementation with the {{{net.xeoh.plugins.base.annotations.@PluginImplementation}}}
-   annotation. An example of plugin is defined below.
+   * packaged under <<org.deri.any23.plugin>> .
+
+   An example of plugin is defined below.
 
 +--------------------------------------
-  @PluginImplementation
   @Author(name="Michele Mostarda (mostarda@fbk.eu)")
   public class HTMLScraperPlugin implements ExtractorPlugin {
 
@@ -56,11 +75,59 @@ Any23 Plugins
   }
 +--------------------------------------
 
-* Available Plugins
+* How to Write a Tool Plugin
+
+   A <Tool Plugin> is a class:
+
+   * implementing the {{{./xref/org/deri/any23/cli/Tool.html}Tool}} interface;
+
+   * packaged under <<org.deri.any23.cli>> .
+
+   An example of plugin is defined below.
+
++--------------------------------------
+@ToolRunner.Description("Prints out the current library version and configuration information.")
+public class Version implements Tool {
+
+    public static void main(String[] args) {
+        System.exit( new Version().run(args) );
+    }
+
+    public int run(String[] args) {
+        final String version = Any23.VERSION;
+        if(version == null) {
+            System.err.println("Error while retrieving configuration info.");
+            return 1;
+        }
+        System.out.println(String.format("Any23 Core v. %s", version));
+        System.out.println();
+        return 0;
+    }
+
+}
++--------------------------------------
+
+* Available Extractor Plugins
+
+  * HTML Scraper Plugin
+
+    The <HTMLScraperPlugin> is able to scrape plain text content from any HTML page
+    and transform it into statement literals.
+
+    This plugin is documented {{{./plugin-html-scraper.html}here}}.
+
+  * Office Scraper Plugins
+
+    The <Office Scraper Plugins> allow to extract semantic content from several
+    <Microsoft Office> document formats.
+
+    These plugins are documented {{{./plugin-office-scraper.html}here}}.
 
-  * <HTMLScraperPlugin>
+* Available CLI Tool Plugins
 
-    The HTMLScraperPlugin is able to extract significant text from any HTML page
-    and transform it to a literal.
+  * Crawler CLI Tool
 
-    The plugin is documented {{{./plugin-html-scraper.html}here}}.
\ No newline at end of file
+    The {{{./xref/org/deri/any23/cli/Crawler.html}Crawler CLI Tool}} is an extension of the
+    {{{./xref/org/deri/any23/cli/Rover.html}Rover CLI Tool}} that adds basic site crawling
+    capabilities. More information about the <CLI> can be found in the
+    {{{./getting-started.html#crawler-tool}Getting Started - Crawler Tool}} section.

Modified: incubator/any23/trunk/src/site/apt/dev-data-conversion.apt
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/src/site/apt/dev-data-conversion.apt?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/src/site/apt/dev-data-conversion.apt (original)
+++ incubator/any23/trunk/src/site/apt/dev-data-conversion.apt Tue Jan 10 16:32:28 2012
@@ -6,43 +6,57 @@ Data Conversion
                              "@prefix : <http://other.example.org/ns#> ." +
                              "foo:bar foo: : .                          " +
                              ":bar : foo:bar .                           ";
-// The second argument of StringDocumentSource() must be a valid URI.
+//    The second argument of StringDocumentSource() must be a valid URI.
 /*3*/ DocumentSource source = new StringDocumentSource(content, "http://host.com/service");
 /*4*/ ByteArrayOutputStream out = new ByteArrayOutputStream();
 /*5*/ TripleHandler handler = new NTriplesWriter(out);
-/*6*/ runner.extract(source, handler);
-/*7*/ String n3 = out.toString("UTF-8");
+      try {
+/*6*/     runner.extract(source, handler);
+      } finally {
+/*7*/     handler.close();
+      }
+/*8*/ String nt = out.toString("UTF-8");
 +----------------------------------------------------------------------------------------------
 
  This example aims to demonstrate how to use <<Any23>> to perform RDF data conversion.
- In this code we provide some input data expressed as Turtle and convert it in N3 format.
+ In this code we provide some input data expressed as <<Turtle>> and convert it into <<NTriples>> format.
 
- At <<row 1>> we define a new instance of the <<Any23>> facade, that provides all the methods
+ At <<line 1>> we define a new instance of the <<Any23>> facade, that provides all the methods
  useful for the transformation. The facade constructor accepts a list of extractor names, if specified
  the extraction will be done only over this list, otherwise the data <MIME Type> will be detected and will be applied
- all the compatible extractors declared within the {{{./xref/org/deri/any23/extractor/ExtractorRegistry.html}ExtractorRegistry}}.
+ all the compatible extractors declared within the
+ {{{./xref/org/deri/any23/extractor/ExtractorRegistry.html}ExtractorRegistry}}.
 
- The <<row 2>> defines the input string containing some {{{http://www.w3.org/TeamSubmission/turtle/}Turtle}} data.
+ The <<line 2>> defines the input string containing some {{{http://www.w3.org/TeamSubmission/turtle/}Turtle}} data.
 
- At <<row 3>> we instantiate a {{{./xref/org/deri/any23/source/StringDocumentSource.html}StringDocumentSource}},
+ At <<line 3>> we instantiate a {{{./xref/org/deri/any23/source/StringDocumentSource.html}StringDocumentSource}},
   specifying the content and the source <URI>.
  The <URI> should be the source of the content data, and must be valid.
  Besides the {{{./xref/org/deri/any23/source/StringDocumentSource.html}StringDocumentSource}},
  you can also provide input from other sources, such as <HTTP> requests
  and local files. See the classes in the sources {{{./xref/org/deri/any23/source/package-summary.html}package}}.
 
- The <<row 4>> defines a buffered output stream that will be used to store the data produced by the
- writer declared at <<row 5>>.
+ The <<line 4>> defines a buffered output stream that will be used to store the data produced by the
+ writer declared at <<line 5>>.
 
- A writer stores the extracted triples in some destination. We use an {{{./xref/org/deri/any23/writer/NTriplesWriter.html}NTriplesWriter}} here that writes into a ByteArrayOutputStream. There are writers for a number of formats, and you can also store the triples directly into a Sesame repository to query them with SPARQL; see {{{./xref/org/deri/any23/writer/RepositoryWriter.html}RepositoryWriter}} and the writer {{{./xref/org/deri/any23/writer/package-summary.html}package}}.
+ A writer stores the extracted triples in some destination.
+ We use an {{{./xref/org/deri/any23/writer/NTriplesWriter.html}NTriplesWriter}} here that writes
+ into a <<ByteArrayOutputStream>>. Writers for the main <<RDF>> formats are available, and it is also possible to store
+ the triples directly into a <<Sesame>> repository to query them via <<SPARQL>>.
+ See {{{./xref/org/deri/any23/writer/RepositoryWriter.html}RepositoryWriter}} and the writer
+ {{{./xref/org/deri/any23/writer/package-summary.html}package}}.
+
+ The extractor method invoked at <<line 6>> performs the metadata extraction.
+ This method accepts as first argument a {{{./xref/org/deri/any23/source/DocumentSource.html}DocumentSource}} and as
+ second argument a {{{./xref/org/deri/any23/writer/TripleHandler.html}TripleHandler}},
+ that will receive the sequence of parsing events generated by the applied extractors. The extract method also defines
+ another signature where it is possible to specify a charset encoding for the input data. If <<null>>, the charset
+ will be auto detected.
 
- The extractor method invoked at <<row 6>> performs the metadata extraction.
- This method accepts as first argument a {{{./xref/org/deri/any23/source/DocumentSource.html}DocumentSource}} and as second argument a {{{./xref/org/deri/any23/writer/TripleHandler.html}TripleHandler}},
- that will receive the sequence parsing events generated by the applied extractors. The extract method defines also another
- signature where it is possible to specify a charset encoding for the input data. If null, the charset will be
- auto detected.
+ The {{{./xref/org/deri/any23/writer/TripleHandler.html}TripleHandler}} needs to be explicitly closed;
+ this is done safely in a <<finally>> block at <<line 7>>.
 
- The expected output is <UTF-8> encoded at <<row 7>>:
+ The expected output is <UTF-8> encoded at <<line 8>>:
 
 +----------------------------------------------------------------------------------------------
 <http://example.org/ns#bar> <http://example.org/ns#> <http://other.example.org/ns#> .

Modified: incubator/any23/trunk/src/site/apt/dev-data-extraction.apt
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/src/site/apt/dev-data-extraction.apt?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/src/site/apt/dev-data-extraction.apt (original)
+++ incubator/any23/trunk/src/site/apt/dev-data-extraction.apt Tue Jan 10 16:32:28 2012
@@ -10,30 +10,37 @@ Data Extraction
       );
 /*5*/ ByteArrayOutputStream out = new ByteArrayOutputStream();
 /*6*/ TripleHandler handler = new NTriplesWriter(out);
-/*7*/ runner.extract(source, handler);
-/*8*/ String n3 = out.toString("UTF-8");
+      try {
+/*7*/     runner.extract(source, handler);
+      } finally {
+/*8*/     handler.close();
+      }
+/*9*/ String n3 = out.toString("UTF-8");
 +----------------------------------------------------------------------------------------------
 
-   This second example demonstrates the data extraction, that is the main purpose of <<Any23>> library.
-   At <<row 1>> we define the <<Any23>> facade instance. As described before, the constructor allows to enforce
+   This example demonstrates data extraction, which is the main purpose of the <<Any23>> library.
+   At <<line 1>> we define the <<Any23>> facade instance. As described before, the constructor allows to enforce
    the usage of specific extractors.
 
-   The <<row 2>> defines the <HTTP User Agent>, used to identify the client during <HTTP> data collection.
-   At <<row 3>> we use the runner to create an instance of {{{./xref/org/deri/any23/http/HTTPClient.html}HTTPClient}},
+   The <<line 2>> defines the <HTTP User Agent>, used to identify the client during <HTTP> data collection.
+   At <<line 3>> we use the runner to create an instance of {{{./xref/org/deri/any23/http/HTTPClient.html}HTTPClient}},
    used by {{{./xref/org/deri/any23/source/HTTPDocumentSource.html}HTTPDocumentSource}} for <HTTP> content fetching.
 
-   The <<row 4>> instantiates an {{{./xref/org/deri/any23/source/HTTPDocumentSource.html}HTTPDocumentSource}} instance,
+   The <<line 4>> instantiates an {{{./xref/org/deri/any23/source/HTTPDocumentSource.html}HTTPDocumentSource}} instance,
    specifying the {{{./xref/org/deri/any23/http/HTTPClient.html}HTTPClient}} and the URL addressing the content
    to be processed.
 
-   At <<row 5>> we define a buffered output stream used to store data produced by the
-   {{{./xref/org/deri/any23/writer/TripleHandler.html}TripleHandler}} defined at <<row 6>>.
+   At <<line 5>> we define a buffered output stream used to store data produced by the
+   {{{./xref/org/deri/any23/writer/TripleHandler.html}TripleHandler}} defined at <<line 6>>.
 
-   The extraction method at <<row 7>> will run the metadata extraction.
-   As discussed in the previous example it needs at least a
+   The extraction method at <<line 7>> will run the metadata extraction.
+   The produced metadata will be written within the passed
    {{{./xref/org/deri/any23/writer/TripleHandler.html}TripleHandler}} instance.
 
-   The expected output is <UTF-8> encoded at <<row 8>> and is:
+   The {{{./xref/org/deri/any23/writer/TripleHandler.html}TripleHandler}} needs to be explicitly closed;
+   this is done safely in a <<finally>> block at <<line 8>>.
+
+   The expected output is <UTF-8> encoded at <<line 9>> and is:
 
 +----------------------------------------------------------------------------------------------
 <http://www.rentalinrome.com/semanticloft/semanticloft.htm> <http://purl.org/dc/terms/title>

Modified: incubator/any23/trunk/src/site/apt/getting-started.apt
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/src/site/apt/getting-started.apt?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/src/site/apt/getting-started.apt (original)
+++ incubator/any23/trunk/src/site/apt/getting-started.apt Tue Jan 10 16:32:28 2012
@@ -2,7 +2,7 @@ Getting started with <<Any23>>
 
     <<Any23>> can be used:
 
-      * as a commandline tool from your preferred shell environment;
+      * via CLI (command line interface) from your preferred shell environment;
       * as a RESTful Webservice;
       * as a library.
 
@@ -18,7 +18,7 @@ Getting started with <<Any23>>
 
 * Use the <<Any23>> CLI
 
-   The command-line tools are provided by the <<any23-core>> module.
+   The command-line tools support is provided by the <<any23-core>> module.
 
    Once <<Any23>> has been correctly {{{./install.html}installed}}, if you want to use it as a commandline tool,
    use the shell scripts within the <<"any23-core/bin">> directory.
@@ -32,37 +32,37 @@ Getting started with <<Any23>>
 
 +-------------------------------------------
 any23-core/bin$ ./any23tools
-[...configuration data...]
 Usage: ToolRunner <utility> [options...]
-where <utility> one of:
-	Eval                                                              Utility for processing output log.
+ where <utility> is one of:
 	ExtractorDocumentation                Utility for obtaining documentation about metadata extractors.
 	MicrodataParser                     Commandline Tool for extracting Microdata from file/HTTP source.
+	MimeDetector                                                                MIME Type Detector Tool.
 	PluginVerifier                                           Utility for plugin management verification.
 	Rover                                                                       Any23 Command Line Tool.
 	Version                        Prints out the current library version and configuration information.
 	VocabPrinter                            Prints out the RDF Schema of the vocabularies used by Any23.
-
 +-------------------------------------------
 
-   The <any23tools> script detects a list of available utilities within the <<any23-core>> classpath
-   and allows to activate them.
-
-   Such utilities are:
+   The <any23tools> script detects a list of available utilities within the <<any23-core>> and <<plugins>>
+   classpath and allows to activate them.
 
-       * <<<Rover>>>: the RDF extraction tool.
+   The <any23-core> CLI tools are:
 
        * <<<ExtractorDocumentation>>>: a utility for obtaining useful information about extractors.
 
        * <<<MicrodataParser>>>:  commandline parser to extract specific Microdata content from a web page
-                        (local or remote) and produce a JSON output compliant with the Microdata
-                        specification ({{{http://www.w3.org/TR/microdata/}http://www.w3.org/TR/microdata/}}).
+         (local or remote) and produce a JSON output compliant with the Microdata
+         specification ({{{http://www.w3.org/TR/microdata/}http://www.w3.org/TR/microdata/}}).
+
+       * <<<MimeDetector>>>: detects the MIME Type for any HTTP / file / direct input resource.
+
+       * <<<PluginVerifier>>>: a utility for verifying <Any23> plugins.
+
+       * <<<Rover>>>: the RDF extraction tool.
 
        * <<<Version>>>: prints out useful information about the library version and configuration.
 
        * <<<VocabPrinter>>>: allows to dump all the <<RDFSchema>> vocabularies declared within Any23.
-       
-       * <<<Eval>>>: commandline utility for processing Any23 generated output logs.
 
 ** Rover
    
@@ -72,21 +72,26 @@ where <utility> one of:
   
 +-------------------------------------------
 any23-core/bin$ any23tools Rover
-[...configuration data...]
-usage: {<url>|<file>} [-e <arg>] [-f <arg>] [-l <arg>] [-n] [-o <arg>]
-       [-p] [-s] [-t] [-v]
- -e <arg>                   comma-separated list of extractors, e.g.
-                            rdf-xml,rdf-turtle
- -f,--Output format <arg>   [turtle (default), ntriples, rdfxml, quad,
-                            uris]
- -l,--log <arg>             logging, please specify a file
- -n,--nesting               disable production of nesting triples
- -o,--output <arg>          ouput file (defaults to stdout)
- -p,--pedantic              validates and fixes HTML content detecting
-                            commons issues
- -s,--stats                 print out statistics of Any23
- -t,--notrivial             filter trivial statements
- -v,--verbose               show progress and debug information
+usage: [{<url>|<file>}]+ [-d <arg>] [-e <arg>] [-f <arg>] [-h] [-l <arg>]
+       [-n] [-o <arg>] [-p] [-s] [-t] [-v]
+ -d,--defaultns <arg>       Override the default namespace used to produce
+                            statements.
+ -e <arg>                   Specify a comma-separated list of extractors,
+                            e.g. rdf-xml,rdf-turtle.
+ -f,--Output format <arg>   [turtle (default), rdfxml, ntriples, nquads,
+                            trix, json, uri]
+ -h,--help                  Print this help.
+ -l,--log <arg>             Produce log within a file.
+ -n,--nesting               Disable production of nesting triples.
+ -o,--output <arg>          Specify Output file (defaults to standard
+                            output).
+ -p,--pedantic              Validate and fixes HTML content detecting
+                            commons issues.
+ -s,--stats                 Print out extraction statistics.
+ -t,--notrivial             Filter trivial statements (e.g. CSS related
+                            ones).
+ -v,--verbose               Show debug and progress information.
+Expected at least 1 argument.
 +-------------------------------------------
 
   Extract metadata from an <<HTML>> page:
@@ -126,7 +131,6 @@ any23-core/bin$ ./any23tools Rover -t -f
 
 +-------------------------------------------
 any23-core/bin$ ./any23tools ExtractorDocumentation
-[...configuration data...]
 Usage:
   ExtractorDocumentation -list
       shows the names of all available extractors
@@ -145,29 +149,29 @@ Usage:
 
 +--------------------------------------
 any23-core/bin$ ./any23tools ExtractorDocumentation -list
-[...configuration data...]
-csv
-html-head-icbm
-html-head-links
-html-head-title
-html-mf-adr
-html-mf-geo
-html-mf-hcalendar
-html-mf-hcard
-html-mf-hlisting
-html-mf-hrecipe
-html-mf-hresume
-html-mf-hreview
-html-mf-license
-html-mf-species
-html-mf-xfn
-html-microdata
-html-rdfa
-html-script-turtle
-rdf-nq
-rdf-nt
-rdf-turtle
-rdf-xml
+                      csv [class org.deri.any23.extractor.csv.CSVExtractor]
+           html-head-icbm [class org.deri.any23.extractor.html.ICBMExtractor]
+          html-head-links [class org.deri.any23.extractor.html.HeadLinkExtractor]
+          html-head-title [class org.deri.any23.extractor.html.TitleExtractor]
+              html-mf-adr [class org.deri.any23.extractor.html.AdrExtractor]
+              html-mf-geo [class org.deri.any23.extractor.html.GeoExtractor]
+        html-mf-hcalendar [class org.deri.any23.extractor.html.HCalendarExtractor]
+            html-mf-hcard [class org.deri.any23.extractor.html.HCardExtractor]
+         html-mf-hlisting [class org.deri.any23.extractor.html.HListingExtractor]
+          html-mf-hrecipe [class org.deri.any23.extractor.html.HRecipeExtractor]
+          html-mf-hresume [class org.deri.any23.extractor.html.HResumeExtractor]
+          html-mf-hreview [class org.deri.any23.extractor.html.HReviewExtractor]
+          html-mf-license [class org.deri.any23.extractor.html.LicenseExtractor]
+          html-mf-species [class org.deri.any23.extractor.html.SpeciesExtractor]
+              html-mf-xfn [class org.deri.any23.extractor.html.XFNExtractor]
+           html-microdata [class org.deri.any23.extractor.microdata.MicrodataExtractor]
+              html-rdfa11 [class org.deri.any23.extractor.rdfa.RDFa11Extractor]
+       html-script-turtle [class org.deri.any23.extractor.html.TurtleHTMLExtractor]
+                   rdf-nq [class org.deri.any23.extractor.rdf.NQuadsExtractor]
+                   rdf-nt [class org.deri.any23.extractor.rdf.NTriplesExtractor]
+                 rdf-trix [class org.deri.any23.extractor.rdf.TriXExtractor]
+               rdf-turtle [class org.deri.any23.extractor.rdf.TurtleExtractor]
+                  rdf-xml [class org.deri.any23.extractor.rdf.RDFXMLExtractor]
 +--------------------------------------
 
 ** MicrodataParser
@@ -177,18 +181,96 @@ rdf-xml
    declared in the Microdata specification section {{{http://www.w3.org/TR/microdata/#json}JSON}}.
 
 +--------------------------------------
-bin/any23tools MicrodataParser
+any23-core/bin$ ./any23tools MicrodataParser
 Usage: {http://path/to/resource.html|file:/path/to/local.file}
 +--------------------------------------
 
 
 ** VocabPrinter
 
-   The VocabPrinter prints out the RDFSchema declared by all the <<Any23>>
+   The VocabPrinter Tool prints out the RDFSchema declared by all the <<Any23>>
    declared vocabularies.
 
    <<This tool is still in beta version.>>
 
+** MimeDetector
+
+   The MimeDetector Tool extracts the <<MIME Type>> for a given source (http:// file:// inline://).
+
+   Examples:
+
++--------------------------------------
+any23-core/bin$ ./any23tools MimeDetector http://www.michelemostarda.com/foaf.rdf
+application/rdf+xml
+
+any23-core/bin$ ./any23tools MimeDetector file://../src/test/resources/application/trix/test1.trx
+application/trix
+
+any23-core/bin$ ./any23tools MimeDetector 'inline://<http://s> <http://p> <http://o> .'
+text/n3
++--------------------------------------
+
+** PluginVerifier
+
+   TODO: missing.
+
+* <<Any23>> CLI <Plugins>
+
+   The <<Any23>> ToolRunner CLI (<bin/any23tools>) supports the auto detection of Tool plugins within the classpath.
+   For further details see {{{./any23-plugins.html}Plugins}} section.
+
+   The default <<any23>> CLI plugins are listed below.
+
+** Crawler Plugin
+
+   {crawler-tool}
+   The <Crawler Plugin> provides basic site crawling and metadata extraction capabilities.
+
++----------------------------------------------------------------------------
+any23-core/bin$ ./any23tools Crawler
+usage: [{<url>|<file>}]+ [-d <arg>] [-e <arg>] [-f <arg>] [-h] [-l <arg>]
+       [-maxdepth <arg>] [-maxpages <arg>] [-n] [-numcrawlers <arg>] [-o
+       <arg>] [-p] [-pagefilter <arg>] [-politenessdelay <arg>] [-s]
+       [-storagefolder <arg>] [-t] [-v]
+ -d,--defaultns <arg>       Override the default namespace used to produce
+                            statements.
+ -e <arg>                   Specify a comma-separated list of extractors,
+                            e.g. rdf-xml,rdf-turtle.
+ -f,--Output format <arg>   [turtle (default), rdfxml, ntriples, nquads,
+                            trix, json, uri]
+ -h,--help                  Print this help.
+ -l,--log <arg>             Produce log within a file.
+ -maxdepth <arg>            Max allowed crawler depth. Default: no limit.
+ -maxpages <arg>            Max number of pages before interrupting crawl.
+                            Default: no limit.
+ -n,--nesting               Disable production of nesting triples.
+ -numcrawlers <arg>         Sets the number of crawlers. Default: 10
+ -o,--output <arg>          Specify Output file (defaults to standard
+                            output).
+ -p,--pedantic              Validate and fixes HTML content detecting
+                            commons issues.
+ -pagefilter <arg>          Regex used to filter out page URLs during
+                            crawling. Default:
+                            '.*(\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|
+                            mp3|mp4|wav|wma|avi|mov|mpeg|ram|m4v|wmv|rm|sm
+                            il|pdf|swf|zip|rar|gz|xml|txt))$'
+ -politenessdelay <arg>     Politeness delay in milliseconds. Default: no
+                            limit.
+ -s,--stats                 Print out extraction statistics.
+ -storagefolder <arg>       Folder used to store crawler temporary data.
+                            Default:
+                            [/var/folders/d5/c_0b4h1d7t1gx6tzz_dn5cj40000g
+                            q/T/]
+ -t,--notrivial             Filter trivial statements (e.g. CSS related
+                            ones).
+ -v,--verbose               Show debug and progress information.
++----------------------------------------------------------------------------
+
+    A usage example:
+
++----------------------------------------------------------------------------
+any23-core/bin$ ./any23tools Crawler -s -f ntriples http://www.repubblica.it 1> out.nt 2> repubblica.log
++----------------------------------------------------------------------------
 
 * Use <<Any23>> as a RESTful Web Service
 

Modified: incubator/any23/trunk/src/site/apt/plugin-html-scraper.apt
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/src/site/apt/plugin-html-scraper.apt?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/src/site/apt/plugin-html-scraper.apt (original)
+++ incubator/any23/trunk/src/site/apt/plugin-html-scraper.apt Tue Jan 10 16:32:28 2012
@@ -11,4 +11,4 @@ HTML Scraper Plugin
 +-----------------
 
   The plugin engine is based on the {{{http://code.google.com/p/boilerpipe/} Boilerpipe}} library extractor.
-  The extractors mentioned as DE, AE, LCE and CE are the ones defined within the library.
\ No newline at end of file
+  The extractors mentioned as <<DE>>, <<AE>>, <<LCE>> and <<CE>> are the ones defined within the library.
\ No newline at end of file

Added: incubator/any23/trunk/src/site/apt/plugin-office-scraper.apt
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/src/site/apt/plugin-office-scraper.apt?rev=1229627&view=auto
==============================================================================
--- incubator/any23/trunk/src/site/apt/plugin-office-scraper.apt (added)
+++ incubator/any23/trunk/src/site/apt/plugin-office-scraper.apt Tue Jan 10 16:32:28 2012
@@ -0,0 +1,12 @@
+Office Scraper Plugins
+
+ * <Excel Plugin>
+
+   The {{{./xref/org/deri/any23/plugin/officescraper/ExcelPlugin.html}ExcelPlugin}} converts any
+   <<Microsoft Excel>> <97-2007> document to <RDF>.
+
+   <<TODO: add conversion schema.>>
+
+ * <Word Plugin>
+
+   <<NOTE: Under development.>>
\ No newline at end of file

Modified: incubator/any23/trunk/src/site/apt/service.apt
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/src/site/apt/service.apt?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/src/site/apt/service.apt (original)
+++ incubator/any23/trunk/src/site/apt/service.apt Tue Jan 10 16:32:28 2012
@@ -65,13 +65,19 @@ Content-Length: 174
 
 * Form-style POST API
 
-  A document body can also be converted by HTTP POSTing form data to http://any23.org/. The Content-Type HTTP header must be set to application/x-www-form-urlencoded. The following parameters are supported:
-
-+---------------------------------------------------
-type	Media type of the input, see the table above. If not present, auto-detection will be attempted.
-body	Document body to be converted
-format	Desired output format; defaults to best
-+---------------------------------------------------
+  A document body can also be converted by HTTP POSTing form data to http://any23.org/.
+  The Content-Type HTTP header must be set to <application/x-www-form-urlencoded>.
+  The following parameters are supported:
+
+*----------+------------------------------------------------------------------------------------------------------------+
+|type	   |Media type of the input, see the table above. If not present, auto-detection will be attempted.             |
+*----------+------------------------------------------------------------------------------------------------------------+
+|body	   |Document body to be converted.                                                                              |
+*----------+------------------------------------------------------------------------------------------------------------+
+|format	   |Desired output format; defaults to <<best>>.                                                                |
+*----------+------------------------------------------------------------------------------------------------------------+
+|validation|The validation level to be applied, supported values: <<none>> (default), <<validate>> and <<validate-fix>>.|
+*----------+------------------------------------------------------------------------------------------------------------+
 
 * Output Formats
 
@@ -93,14 +99,16 @@ Code	                    Reason
 502 Bad Gateway	            Input document from a remote server could not be fetched or parsed.
 +---------------------------------------------------
 
-* Report Format
+* XML Report Format
+
+    {report-format}
 
     The <Any23 Service> can optionally return an XML report and attempt error fix if
     the flags <fix> and <report> are activated ( <fix=on&report=on> ).
     The following URL shows how to use these flags.
 
 +---------------------------------------------------
-http://any23.org/any23-service/any23/?format=best&uri=http%3A%2F%2Fpath%2Fto%2Fresource&fix=on&report=on
+http://any23.org/any23-service/any23/?format=best&uri=http%3A%2F%2Fpath%2Fto%2Fresource&validation=none&report=on
 +---------------------------------------------------
 
     The <fix> functionality is described {{{./dev-validation-fix.html}here}}.
@@ -112,29 +120,37 @@ http://any23.org/any23-service/any23/?fo
     <response/report/error> section the error stack trace if available.
 
     The result of validation is contained within the <response/report/validationReport> node.
-    Whithin that node there is the list of the activated rules, the issues detected and
+    Within that node there is the list of the activated rules, the issues detected and
     the errors generated.
 
-+---------------------------------------------------
++-----------------------------------------------------------------------------------------------------------------
 <?xml version="1.0" encoding="UTF-8" ?>
 <response>
+    <!-- List of activated extractors. -->
     <extractors>
-        <extractor>extractor-a</extractor>
-        <extractor>extractor-b</extractor>
+        <extractor><!-- Extractor name. --></extractor>
+         <!-- ... -->
     </extractors>
     <report>
-        <message/>
-        <error> </error>
-        <validationReport>
-            <errors> </errors>
-            <issues> </issues>
-            <ruleActivations> </ruleActivations>
-        </validationReport>
-    </report>
+        <message></message>
+        <error></error>
+        <!-- Validation specific report, contains all errors and issues detected within the document. -->
+        <validationReport>
+            <!-- List of errors found while validating the document. -->
+            <errors>
+            </errors>
+            <!-- List of issues found while validating the document. -->
+            <issues>
+            </issues>
+            <!-- List of rules activated to solve the detected issues. -->
+            <ruleActivations>
+            </ruleActivations>
+        </validationReport>
+    </report>
     <data>
-        <![CDATA[
-        <http://sub> <http://pred> <http://obj> <http://graph> .
-        ]]>
+<![CDATA[
+ -- Actual Data in the format specified as output. --
+]]>
     </data>
 </response>
-+---------------------------------------------------
\ No newline at end of file
++-----------------------------------------------------------------------------------------------------------------
\ No newline at end of file

Modified: incubator/any23/trunk/src/site/apt/supported-formats.apt
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/src/site/apt/supported-formats.apt?rev=1229627&r1=1229626&r2=1229627&view=diff
==============================================================================
--- incubator/any23/trunk/src/site/apt/supported-formats.apt (original)
+++ incubator/any23/trunk/src/site/apt/supported-formats.apt Tue Jan 10 16:32:28 2012
@@ -34,9 +34,16 @@ Supported Formats in Any23
 
    * <<RDF/XML>> <<Any23>> is able to produce output in {{{http://www.w3.org/TR/rdf-syntax-grammar/}RDF/XML}}.
 
-   * <<JSON>> <<Any23>> is able to produce output in {{{http://www.json.org/}JSON}} .
+   * <<JSON Statements>> <<Any23>> is able to produce output in {{{http://www.json.org/}JSON}} . See the specific {{{json-statements}format}}.
+
+   * <<XML Report>> <<Any23>> is able to produce a detailed report of the latest document extraction if required. See further details {{{./service.html#report-format}here}}.
+
+* JSON Statements Format
+
+  {json-statements}
+
+     Any23 is able to produce JSON output following the format described below.
 
-     In particular we choose the format described below.
      Given the following example statements (expressed in N-Quads format):
 
 +-------------------------------------------------------------------------------
@@ -97,7 +104,7 @@ Supported Formats in Any23
     The <<JSON object>> structure is described by the following <<BNF>> rules,
     where quotes are omitted to improve readability:
 
-+-----------------------------------------------------------------------------------------------------------------
++-------------------------------------------------------------------------------
  <json-response> ::= { "quads" : <statements> }
  <statements>    ::= [ <statement>+ ]
  <statement>     ::= [ <subject> , <predicate> , <object> , <graph> ]
@@ -111,4 +118,4 @@ Supported Formats in Any23
  <lang>          ::= String | null
  <datatype>      ::= <uri>  | null
  <uri>           ::= String
-+-----------------------------------------------------------------------------------------------------------------
++-------------------------------------------------------------------------------