You are viewing a plain text version of this content. The canonical link for it is here.

Posted to droids-dev@incubator.apache.org by "Lochschmied, Alexander" <Al...@vishay.com> on 2012/09/06 15:03:47 UTC

Production ready for indexing in Solr?

Hi,

we are looking for a simple but fast Java framework to crawl all pages of our website. Ultimate goal is to index certain parts of the page in a Solr search system.

The crawler must obviously read the HTML. Is it possible to get the page content from Droids without reading the HTTP stream again?
I am not sure if below test code is ok, but it seemed to return same URLs twice. That's why I tried to fix that in the "MyCrawlingDroid" class, but I assume that's the wrong place anyway. Different HTTP GET URL parameters in links found should be treated as different links. I saw Droids may have a problem with that? https://issues.apache.org/jira/browse/DROIDS-144

The reason why we do not yet "simply" use Nutch is that we already have Java code to index other data sources in Solr. So it would be nice to be able to integrate a Crawler framework in this code and reuse our other processing/indexing logic.

I have not found good examples that fit in our (Guice based) system. Can you recommend something or is there not much point in trying to use Droids for such a system (yet)?

Thanks,
Alexander

import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
import org.apache.droids.api.TaskMaster;
import org.apache.droids.api.Worker;
import org.apache.droids.delay.SimpleDelayTimer;
import org.apache.droids.exception.DroidsException;
import org.apache.droids.handle.SysoutHandler;
import org.apache.droids.helper.factories.DroidFactory;
import org.apache.droids.helper.factories.HandlerFactory;
import org.apache.droids.helper.factories.ParserFactory;
import org.apache.droids.helper.factories.ProtocolFactory;
import org.apache.droids.helper.factories.URLFiltersFactory;
import org.apache.droids.impl.DefaultTaskExceptionHandler;
import org.apache.droids.impl.SequentialTaskMaster;
import org.apache.droids.net.RegexURLFilter;
import org.apache.droids.parse.html.HtmlParser;
import org.apache.droids.protocol.http.DroidsHttpClient;
import org.apache.droids.protocol.http.HttpProtocol;
import org.apache.droids.robot.crawler.CrawlingDroid;
import org.apache.droids.robot.crawler.CrawlingWorker;
import org.apache.http.HttpVersion;
import org.apache.http.conn.params.ConnManagerParamBean;
import org.apache.http.conn.params.ConnPerRouteBean;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParamBean;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HTTP;

/**
 * Stand-alone test driver that wires up an Apache Droids {@link CrawlingDroid} and prints
 * every distinct URI that reaches the content handler.
 *
 * <p>The start URL is taken from the first command-line argument; when no argument is
 * given, {@link #DEFAULT_TARGET_URL} is crawled.
 */
public class VishayIndexerCrawler {

    /** Start URL used when no command-line argument is supplied. */
    private static final String DEFAULT_TARGET_URL = "http://www.vishay.com";

    /**
     * Configures parser, protocol and URL-filter factories, starts the crawl and waits
     * for the task master before releasing the HTTP connections.
     *
     * @param args optional; {@code args[0]} is the URL to start crawling from
     * @throws Exception if crawler initialisation or the crawl itself fails
     */
    public static void main(String[] args) throws Exception {
        String targetURL = args.length > 0 ? args[0] : DEFAULT_TARGET_URL;

        // Create parser factory. Support basic HTML markup only
        ParserFactory parserFactory = new ParserFactory();
        HtmlParser htmlParser = new HtmlParser();
        htmlParser.setElements(new HashMap<String, String>());
        htmlParser.getElements().put("a", "href");
        htmlParser.getElements().put("link", "href");
//        htmlParser.getElements().put("img", "src");
//        htmlParser.getElements().put("script", "src");
        parserFactory.getMap().put("text/html", htmlParser);

        // Create protocol factory. Support HTTP/S only.
        ProtocolFactory protocolFactory = new ProtocolFactory();

        // Create and configure HTTP client; the three beans all write into the same params object
        HttpParams params = new BasicHttpParams();
        HttpProtocolParamBean hppb = new HttpProtocolParamBean(params);
        HttpConnectionParamBean hcpb = new HttpConnectionParamBean(params);
        ConnManagerParamBean cmpb = new ConnManagerParamBean(params);

        // Set protocol parameters
        hppb.setVersion(HttpVersion.HTTP_1_1);
        hppb.setContentCharset(HTTP.ISO_8859_1);
        hppb.setUseExpectContinue(true);
        // Set connection parameters
        hcpb.setStaleCheckingEnabled(false);
        // Set connection manager parameters
        ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean();
        connPerRouteBean.setDefaultMaxPerRoute(2);
        cmpb.setConnectionsPerRoute(connPerRouteBean);

        DroidsHttpClient httpclient = new DroidsHttpClient(params);
        try {
            // The same protocol instance serves both schemes.
            HttpProtocol httpProtocol = new HttpProtocol(httpclient);
            protocolFactory.getMap().put("http", httpProtocol);
            protocolFactory.getMap().put("https", httpProtocol);

            // Create URL filter factory.
            URLFiltersFactory filtersFactory = new URLFiltersFactory();
            RegexURLFilter defaultURLFilter = new RegexURLFilter();
            defaultURLFilter.setFile("classpath:/regex-urlfilter.txt");
            filtersFactory.getMap().put("default", defaultURLFilter);

            // Create default droid
            SimpleDelayTimer simpleDelayTimer = new SimpleDelayTimer();
            simpleDelayTimer.setDelayMillis(100);

            Queue<Link> simpleQueue = new LinkedList<Link>();

            SequentialTaskMaster<Link> taskMaster = new SequentialTaskMaster<Link>();
            taskMaster.setDelayTimer(simpleDelayTimer);
            taskMaster.setExceptionHandler(new DefaultTaskExceptionHandler());

            CrawlingDroid helloCrawler = new MyCrawlingDroid(simpleQueue, taskMaster);
            helloCrawler.setFiltersFactory(filtersFactory);
            helloCrawler.setParserFactory(parserFactory);
            helloCrawler.setProtocolFactory(protocolFactory);

            Collection<String> initialLocations = new ArrayList<String>();
            initialLocations.add(targetURL);
            helloCrawler.setInitialLocations(initialLocations);

            // Initialize and start the crawler
            helloCrawler.init();
            helloCrawler.start();

            // Await termination
            helloCrawler.getTaskMaster().awaitTermination(0, TimeUnit.MILLISECONDS);
        } finally {
            // Shut down the HTTP connection manager even when the crawl fails,
            // so pooled connections are not left open.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Crawling droid whose workers print each URI once instead of handing the content to
     * registered handlers.
     */
    static class MyCrawlingDroid extends CrawlingDroid {
        /** URIs already printed; static, so shared by every droid and worker in this JVM. */
        static final Set<URI> visited = Collections.synchronizedSet(new HashSet<URI>());

        /**
         * @param queue      task queue passed through to {@link CrawlingDroid}
         * @param taskMaster task master passed through to {@link CrawlingDroid}
         */
        public MyCrawlingDroid(Queue<Link> queue, TaskMaster<Link> taskMaster) {
            super(queue, taskMaster);
        }

        /**
         * Creates a worker whose handler factory prints a URI the first time it is seen
         * and silently accepts it on every later occurrence.
         *
         * @return a new {@link CrawlingWorker} bound to this droid
         */
        @Override
        public Worker<Link> getNewWorker() {
            final CrawlingWorker worker = new CrawlingWorker(this);
            HandlerFactory hf = new HandlerFactory() {
                @Override
                public boolean handle(URI uri, ContentEntity entity) throws DroidsException {
                    // Set.add reports whether the URI was new, which makes the
                    // "seen before?" check and the insert a single atomic step.
                    if (visited.add(uri)) {
                        System.out.println(uri);
                    }
                    return true;
                }
            };
            worker.setHandlerFactory(hf);
            return worker;
        }
    }
}

Re: Production ready for indexing in Solr?

Posted by Richard Frovarp <rf...@apache.org>.

Yes, it is production ready. I'm using it just fine.

Depending on your needs, it can do a lot more than Nutch, as you're 
writing all of the code to control it, so you can also persist off to a 
database or anything you really want.

I am able to crawl our sites in an incremental manner (checking ETag and 
Last-Modifieds out of a database) in about an hour. I could probably go 
faster, but I think I might kill our web server or database server.

DROIDS-144 is for a filter that you can use. By default it isn't used. 
In my implementation, I don't use it, so it doesn't affect me. We should 
deal with the bug.

I do not use the droids-solr module. I just include SolrJ and do 
everything that way as my needs are very different from what the module 
does. I suspect you'll find the same, but as you already have 
experience, you shouldn't have any difficulty there.

What do you mean by "Is it possible to get the page content from Droids 
without reading the HTTP stream again"? In your handle function you just 
get the content out of the entity, by calling getParse(). You probably 
want to look at using the TikaDocumentParser instead of HtmlParser, as 
it has a bit better handling. You then get back a TikaParse from 
getParse(), which has all sorts of content parsings to retrieve.

With the same URL twice problem, I would change your queue. For 
simpleQueue, instead use a SimpleTaskQueueWithHistory from droids-core. 
That will automatically prevent you from double visiting a URL. You can 
then remove your logic for handling visited. That uses the full URI, so 
different GET parameters will constitute a different work task, so they 
will all be handled, for better or for worse. I'm guessing your site is 
well behaved, so it is what you want. The sites I crawl aren't as well 
behaved, so while it is desirable, I have to put a max depth in, 
otherwise it will go on forever.

If you want to be able to do logic before requesting the content, you'll 
need to add a work monitor.

As far as working with Guice, I have no experience with that, so I can't 
comment on that piece. I know that some on the list are fans of it, so 
they might have ideas. And as always, contributions welcome. If there 
are changes we can make to make Guice easier, we certainly can look at 
them, we just need to be told what they are, and obviously patches 
really help out if possible.

On 09/06/2012 10:03 AM, Lochschmied, Alexander wrote:
> Hi,
>
> we are looking for a simple but fast Java framework to crawl all pages of our website. Ultimate goal is to index certain parts of the page in a Solr search system.
>
> The crawler must obviously read the HTML. Is it possible to get the page content from Droids without reading the HTTP stream again?
> I am not sure if below test code is ok, but it seemed to return same URLs twice. That's why I tried to fix that in the "MyCrawlingDroid" class, but I assume that's the wrong place anyway. Different HTTP GET URL parameters in links found should be treated as different links. I saw Droids may have a problem with that? https://issues.apache.org/jira/browse/DROIDS-144
>
> The reason why we do not yet "simply" use Nutch is that we already have Java code to index other data sources in Solr. So it would be nice to be able to integrate a Crawler framework in this code and reuse our other processing/indexing logic.
>
> I have not found good examples that fit in our (Guice based) system. Can you recommend something or is there not much point in trying to use Droids for such a system (yet)?
>
> Thanks,
> Alexander
>
> import java.net.URI;
> import java.util.ArrayList;
> import java.util.Collection;
> import java.util.Collections;
> import java.util.HashMap;
> import java.util.HashSet;
> import java.util.LinkedList;
> import java.util.Queue;
> import java.util.Set;
> import java.util.concurrent.TimeUnit;
>
> import org.apache.droids.api.ContentEntity;
> import org.apache.droids.api.Link;
> import org.apache.droids.api.TaskMaster;
> import org.apache.droids.api.Worker;
> import org.apache.droids.delay.SimpleDelayTimer;
> import org.apache.droids.exception.DroidsException;
> import org.apache.droids.handle.SysoutHandler;
> import org.apache.droids.helper.factories.DroidFactory;
> import org.apache.droids.helper.factories.HandlerFactory;
> import org.apache.droids.helper.factories.ParserFactory;
> import org.apache.droids.helper.factories.ProtocolFactory;
> import org.apache.droids.helper.factories.URLFiltersFactory;
> import org.apache.droids.impl.DefaultTaskExceptionHandler;
> import org.apache.droids.impl.SequentialTaskMaster;
> import org.apache.droids.net.RegexURLFilter;
> import org.apache.droids.parse.html.HtmlParser;
> import org.apache.droids.protocol.http.DroidsHttpClient;
> import org.apache.droids.protocol.http.HttpProtocol;
> import org.apache.droids.robot.crawler.CrawlingDroid;
> import org.apache.droids.robot.crawler.CrawlingWorker;
> import org.apache.http.HttpVersion;
> import org.apache.http.conn.params.ConnManagerParamBean;
> import org.apache.http.conn.params.ConnPerRouteBean;
> import org.apache.http.params.BasicHttpParams;
> import org.apache.http.params.HttpConnectionParamBean;
> import org.apache.http.params.HttpParams;
> import org.apache.http.params.HttpProtocolParamBean;
> import org.apache.http.protocol.HTTP;
>
> public class VishayIndexerCrawler {
>
>      public static void main(String[] args) throws Exception {
>
> //        if (args.length < 1) {
> //            System.out.println("Please specify a URL to crawl");
> //            System.exit(-1);
> //        }
>          String targetURL = "http://www.vishay.com"; // args[0];
>
>          // Create parser factory. Support basic HTML markup only
>          ParserFactory parserFactory = new ParserFactory();
>          HtmlParser htmlParser = new HtmlParser();
>          htmlParser.setElements(new HashMap<String, String>());
>          htmlParser.getElements().put("a", "href");
>          htmlParser.getElements().put("link", "href");
> //        htmlParser.getElements().put("img", "src");
> //        htmlParser.getElements().put("script", "src");
>          parserFactory.getMap().put("text/html", htmlParser);
>
>          // Create protocol factory. Support HTTP/S only.
>          ProtocolFactory protocolFactory = new ProtocolFactory();
>
>          // Create and configure HTTP client
>          HttpParams params = new BasicHttpParams();
>          HttpProtocolParamBean hppb = new HttpProtocolParamBean(params);
>          HttpConnectionParamBean hcpb = new HttpConnectionParamBean(params);
>          ConnManagerParamBean cmpb = new ConnManagerParamBean(params);
>
>          // Set protocol parametes
>          hppb.setVersion(HttpVersion.HTTP_1_1);
>          hppb.setContentCharset(HTTP.ISO_8859_1);
>          hppb.setUseExpectContinue(true);
>          // Set connection parameters
>          hcpb.setStaleCheckingEnabled(false);
>          // Set connection manager parameters
>          ConnPerRouteBean connPerRouteBean = new ConnPerRouteBean();
>          connPerRouteBean.setDefaultMaxPerRoute(2);
>          cmpb.setConnectionsPerRoute(connPerRouteBean);
>
>          DroidsHttpClient httpclient = new DroidsHttpClient(params);
>
>          HttpProtocol httpProtocol = new HttpProtocol(httpclient);
>          protocolFactory.getMap().put("http", httpProtocol);
>          protocolFactory.getMap().put("https", httpProtocol);
>
>          // Create URL filter factory.
>          URLFiltersFactory filtersFactory = new URLFiltersFactory();
>          RegexURLFilter defaultURLFilter = new RegexURLFilter();
>          defaultURLFilter.setFile("classpath:/regex-urlfilter.txt");
>          filtersFactory.getMap().put("default", defaultURLFilter);
>
>          // Create handler factory. Provide sysout handler only.
>          HandlerFactory handlerFactory = new HandlerFactory();
>          SysoutHandler defaultHandler = new SysoutHandler();
>          handlerFactory.getMap().put("default", defaultHandler);
>
>          // Create droid factory. Leave it empty for now.
>          DroidFactory<Link> droidFactory = new DroidFactory<Link>();
>
>          // Create default droid
>          SimpleDelayTimer simpleDelayTimer = new SimpleDelayTimer();
>          simpleDelayTimer.setDelayMillis(100);
>
>          Queue<Link> simpleQueue = new LinkedList<Link>();
>
>          SequentialTaskMaster<Link> taskMaster = new SequentialTaskMaster<Link>();
>          taskMaster.setDelayTimer(simpleDelayTimer);
>          taskMaster.setExceptionHandler(new DefaultTaskExceptionHandler());
>
>          CrawlingDroid helloCrawler = new MyCrawlingDroid(simpleQueue, taskMaster);
>          helloCrawler.setFiltersFactory(filtersFactory);
>          helloCrawler.setParserFactory(parserFactory);
>          helloCrawler.setProtocolFactory(protocolFactory);
>
>          Collection<String> initialLocations = new ArrayList<String>();
>          initialLocations.add(targetURL);
>          helloCrawler.setInitialLocations(initialLocations);
>
>          // Initialize and start the crawler
>          helloCrawler.init();
>          helloCrawler.start();
>
>          // Await termination
>          helloCrawler.getTaskMaster().awaitTermination(0, TimeUnit.MILLISECONDS);
>          // Shut down the HTTP connection manager
>          httpclient.getConnectionManager().shutdown();
>      }
>
>      static class MyCrawlingDroid extends CrawlingDroid {
>          final static Set<URI> visited = Collections.synchronizedSet(new HashSet<URI>());
>
>          public MyCrawlingDroid(Queue<Link> queue, TaskMaster<Link> taskMaster) {
>              super(queue, taskMaster);
>          }
>
>          @Override
>          public Worker<Link> getNewWorker() {
>              final CrawlingWorker worker = new CrawlingWorker(this);
>              // worker.setHandlerFactory(DroidsFactory.createDefaultHandlerFactory(new SysoutHandler()));
>              HandlerFactory hf = new HandlerFactory() {
>                  public boolean handle(URI uri, ContentEntity entity) throws DroidsException {
>                      if (visited.contains(uri)) {
>                          return true;
>                      }
> //                    entity.getParse().getOutlinks().contains(new Link()) {
> //                        System.err.println("THERE IS ONE: " + uri);
> //                    }
>                      System.out.println(uri);
>                      visited.add(uri);
>                      return true;
>                  }
>              };
>              worker.setHandlerFactory(hf);
>              return worker;
>          }
>      }
> }
>
>