You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by mi...@apache.org on 2009/09/01 22:11:33 UTC

svn commit: r810273 [2/4] - in /incubator/droids/trunk/droids-crawler: ./ docs/ docs/diagrams/ src/ src/main/ src/main/groovy/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/droids/ src/main/java/org/apache/droids/...

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.core.Worker;
+import org.apache.droids.crawler.util.CrawlerExecutorService;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.ApplicationContext;
+
+import java.util.Queue;
+
+/**
+ * This controller works without droids-core in a standalone manner.
+ *
+ * @param <T>
+ */
+public class StandaloneCrawlerController<T extends Link> extends AbstractCrawlerController<T>{
+    @Autowired ApplicationContext context;
+    static Log log = LogFactory.getLog(StandaloneCrawlerController.class);
+    /** True while {@link #start()} is running its dispatch loop. */
+    protected boolean started = false;
+    /** Optional at injection time, but required by {@link #start()}. */
+    @Autowired(required = false) protected CrawlerExecutorService executorService;
+    /** Sleep time between two iterations of the dispatch loop, in milliseconds. */
+    protected long tickleTime = 5000L;
+    /** Idle period (no active worker) after which the loop stops, in milliseconds. */
+    protected long timeout = 30 * 1000L;
+    protected int threads = 1;
+
+    public boolean isStarted(){ return started; }
+
+    /**
+     * Runs the dispatch loop on the calling thread until the executor has had no active
+     * worker for at least {@link #getTimeout()} milliseconds.
+     * <p/>
+     * On every iteration one "worker" bean is submitted if the queue is not empty and the
+     * executor has spare capacity; otherwise the loop sleeps for {@link #getTickleTime()}.
+     *
+     * @throws CrawlerException if no executor service is configured, or if the calling
+     *                          thread is interrupted while sleeping (the interrupt flag is
+     *                          restored before the exception is thrown)
+     */
+    public void start() throws CrawlerException{
+        // executorService is injected with required = false; fail with a clear message
+        // instead of a NullPointerException inside the loop.
+        if (executorService == null)
+            throw new CrawlerException("start() - no executorService is configured");
+
+        started = true;
+        try{
+            if (log.isInfoEnabled())
+                log.info("start() - started: " + started + ", queue.size: " + queue.size() + ", queue: " + queue + ", executorService: " + executorService);
+
+            while (started){
+                if (!queue.isEmpty() && executorService.getActiveCount() < executorService.getMaximumPoolSize()){
+                    Worker<T> worker = context.getBean("worker", Worker.class);
+                    if (log.isInfoEnabled())
+                        log.info("to execute worker: " + worker + ", queue.isEmpty(): " + queue.isEmpty() + ", executors: " + executorService);
+                    executorService.execute(worker);
+                }
+
+                long now = System.currentTimeMillis();
+                if (executorService.getActiveCount() == 0 && (now - executorService.getLastCompleted() >= timeout)){
+                    //normal shutdown case that no task is running, and graceful period is passed
+                    started = false;
+                    if (log.isInfoEnabled())
+                        log.info("to stop - no active thread and graceful period is passed - now-lastCompleted: " + (now - executorService.getLastCompleted()) + ", timeout: " + timeout);
+                } else{
+                    if (log.isDebugEnabled())
+                        log.debug("to sleep for " + tickleTime + "ms - now-lastCompleted: " + (now - executorService.getLastCompleted()) + ", timeout: " + timeout);
+                    try{
+                        Thread.sleep(tickleTime);
+                    } catch (InterruptedException e){
+                        // Restore the interrupt status so callers further up can observe it.
+                        Thread.currentThread().interrupt();
+                        throw new CrawlerException(e.getMessage(), e);
+                    }
+                }
+
+                //TODO add other condition including forceful termination
+                //TODO handle worker timeout: a worker that never finishes keeps getActiveCount() above zero, so this loop never stops
+            }
+
+            if (log.isInfoEnabled()){
+                String fmt1 = "%1$-20s";
+                log.info(String.format(fmt1, "seeds: ") + seeds + ", seeds.size(): " + ((seeds != null) ? seeds.size() : 0));
+                log.info(String.format(fmt1, "queue: ") + queue + ", queue.size(): " + ((queue != null) ? queue.size() : 0));
+                log.info(String.format(fmt1, "filters: ") + filters);
+                log.info(String.format(fmt1, "crawlerService: ") + crawlerService);
+            }
+        } finally{
+            // Also reached on exceptions, so isStarted() never keeps reporting a dead loop.
+            started = false;
+        }
+    }
+
+    public CrawlerExecutorService getExecutorService(){
+        return executorService;
+    }
+
+    public void setExecutorService(CrawlerExecutorService executorService){
+        this.executorService = executorService;
+    }
+
+    public long getTickleTime(){
+        return tickleTime;
+    }
+
+    public void setTickleTime(long tickleTime){
+        this.tickleTime = tickleTime;
+    }
+
+    public long getTimeout(){
+        return timeout;
+    }
+
+    public void setTimeout(long timeout){
+        this.timeout = timeout;
+    }
+
+    public int getThreads(){
+        return threads;
+    }
+
+    public void setThreads(int threads){
+        this.threads = threads;
+    }
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.CrawlerService;
+import org.apache.droids.crawler.parser.Parser;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+
+import javax.annotation.PostConstruct;
+import java.util.*;
+
+/**
+ * Base class for extractors that keeps the list of {@link ExtractFilter}s and applies
+ * them to every extraction result.
+ *
+ * @param <T> link type
+ * @param <P> parser type this extractor reads from
+ */
+public abstract class AbstractExtractor<T extends Link, P extends Parser> implements Extractor<T, P>{
+    protected static Log log = LogFactory.getLog(AbstractExtractor.class);
+    public static final long serialVersionUID = CrawlerService.DEFAULT_SERIALVERSIONID;
+    @Autowired(required = false) protected List<ExtractFilter<T>> filters;
+    @Autowired(required = false) @Qualifier("extractor.filterComparator") protected Comparator filterComparator;
+
+    abstract public Set<T> extract(T base, P parser);
+
+    abstract public boolean matches(T link);
+
+    /**
+     * Sorts the filters with the configured comparator, if both are present.
+     *
+     * @return this extractor
+     */
+    @PostConstruct Extractor<T, P> init(){
+        if (filters != null && filterComparator != null) Collections.sort(filters, filterComparator);
+        return this;
+    }
+
+    /**
+     * Passes an extraction result through every filter in order and marks the link as
+     * {@link Link.State#EXTRACTED}.
+     *
+     * @param link   the link the result was extracted from
+     * @param result the extracted links; each filter receives the previous filter's output
+     * @return the result returned by the last filter, or {@code result} unchanged when
+     *         there is no filter
+     */
+    protected Set<T> extracted(T link, Set<T> result){
+        if (hasFilter()){
+            for (ExtractFilter<T> filter : getFilters()){
+                result = filter.extracted(link, this, result);
+                if (log.isTraceEnabled())
+                    log.trace("filtered - filter: " + filter + ", result.size(): " + (result != null ? result.size() : 0));
+            }
+        }
+        link.setState(Link.State.EXTRACTED);
+        return result;
+    }
+
+    /**
+     * Appends a filter, creating the filter list on first use.
+     */
+    public boolean addFilter(ExtractFilter<T> filter){
+        if (filters == null) filters = new ArrayList<ExtractFilter<T>>();
+        return filters.add(filter);
+    }
+
+    /**
+     * Removes a filter.
+     *
+     * @return {@code true} if the filter was registered; {@code false} otherwise,
+     *         including when no filter list exists yet
+     */
+    public boolean removeFilter(ExtractFilter<T> filter){
+        // filters is optional (required = false), so it may never have been set.
+        return filters != null && filters.remove(filter);
+    }
+
+    public void setFilters(List<ExtractFilter<T>> filters){
+        this.filters = filters;
+    }
+
+    public List<ExtractFilter<T>> getFilters(){
+        return filters;
+    }
+
+    public boolean hasFilter(){
+        return filters != null && filters.size() > 0;
+    }
+
+    public Comparator getFilterComparator(){
+        return filterComparator;
+    }
+
+    public void setFilterComparator(Comparator filterComparator){
+        this.filterComparator = filterComparator;
+    }
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.Filter;
+import org.apache.droids.crawler.parser.Parser;
+
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Filter hooks invoked on the set of links produced by extraction.
+ *
+ * @param <T> link type
+ */
+public interface ExtractFilter<T extends Link> extends Filter<T>{
+
+    /**
+     * Called after the out-links of a link have been extracted by a single extractor.
+     *
+     * @param base      the link that was extracted
+     * @param extractor the extractor that produced {@code links}
+     * @param links     the links extracted so far; presumably may be {@code null} when
+     *                  nothing was extracted (AbstractExtractor guards against a null result)
+     * @return the set of links to keep; AbstractExtractor hands it to the next filter
+     */
+    Set<T> extracted(T base, Extractor<T, ? extends Parser> extractor, Set<T> links);
+
+    /**
+     * Called after the out-links of a link have been extracted by all extractors.
+     *
+     * @param base       the link that was extracted
+     * @param extractors the extractors that took part
+     * @param links      the aggregated links
+     * @return the set of links to keep
+     */
+    Set<T> extractedAll(T base, List<Extractor<T, ? extends Parser>> extractors, Set<T> links);
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.FilterSupport;
+import org.apache.droids.crawler.util.LinkMatcher;
+import org.apache.droids.crawler.parser.Parser;
+
+import java.io.Serializable;
+import java.util.Set;
+
+/**
+ * Third of the 3-step crawling process after a link is polled from the Queue ( fetch -> parse -> extract )
+ * <p/>
+ * An Extractor extracts data from a Parser. Its primary purpose is to extract out-links, but it is also
+ * designed for extracting any user data to store in a Link. By design, an extractor is associated with a parser.
+ * <p/>
+ * Every link is fetched and parsed once, but may be extracted multiple times, once by each matching
+ * extractor. The results are aggregated.
+ *
+ * @param <T> link type
+ * @param <P> parser type the extractor reads its data from
+ */
+public interface Extractor<T extends Link, P extends Parser> extends LinkMatcher<T>, FilterSupport<ExtractFilter<T>>, Serializable{
+
+    /**
+     * Extracts the out-links of {@code base} from the data held by {@code parser}.
+     *
+     * @param base   the link whose content was parsed
+     * @param parser the parser holding the parsed data
+     * @return the extracted links
+     */
+    Set<T> extract(T base, P parser);
+
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.droids.crawler.CrawlerException;
+
+/**
+ * {@link CrawlerException} subtype for failures in the extract step.
+ */
+public class ExtractorException extends CrawlerException{
+    /** Creates an exception without message or cause. */
+    public ExtractorException(){
+    }
+
+    /**
+     * @param message description of the failure
+     */
+    public ExtractorException(String message){
+        super(message);
+    }
+
+    /**
+     * @param message description of the failure
+     * @param cause   the underlying exception
+     */
+    public ExtractorException(String message, Throwable cause){
+        super(message, cause);
+    }
+
+    /**
+     * @param cause the underlying exception
+     */
+    public ExtractorException(Throwable cause){
+        super(cause);
+    }
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.util.URIResolver;
+import org.apache.droids.crawler.parser.Parser;
+import org.apache.droids.crawler.parser.impl.NekoHtmlParser;
+import org.apache.droids.crawler.parser.impl.AbstractHierarchicalDataParser;
+import org.apache.droids.crawler.extractor.Extractor;
+
+import java.net.URI;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Collection;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
+
+/**
+ * This is a generic HTML link extractor that expects the parser returns data in a hierarchy as follows:
+ * [ ${tagName} : [ $attributeName: [ ${relativeLink1}, ${relativeLink2} ] as Set ]
+ * <p/>
+ * And it output a Set<Link>
+ * 
+ * TODO: include DROIDS-59 from LinkExtractor
+ */
+public class HtmlElementLinkExtractor<T extends Link> extends AbstractExtractor<T, Parser<T, ?>> implements Extractor<T, Parser<T, ?>>{
+    protected static Log log = LogFactory.getLog(HtmlElementLinkExtractor.class);
+    protected static String[] SUPPORTED_TAGS = new String[]{"A", "LINK", "IMG", "SCRIPT", "AREA", "STYLE"};
+
+    // reference: http://java.sun.com/javase/6/docs/api/java/util/regex/Pattern.html
+    static final Pattern JS_ANCHOR = Pattern.compile("^JAVASCRIPT.*", Pattern.CASE_INSENSITIVE);
+    static final Pattern CSS_BG = Pattern.compile("^.*background-image\\s*:\\s*url\\(\\s*['\"]?([^'\"\\)\\s]*)['\"]?\\s*\\).*", Pattern.CASE_INSENSITIVE);
+    //static final Pattern CSS_BG = Pattern.compile("background-image\\s*:\\s*url\\s*\\(\\s*['\"]?(.*)['\"]?\\s*\\)", Pattern.CASE_INSENSITIVE);
+
+
+    public boolean matches(T link){
+        if (link.containsKey("parsed") && !(link.get("parsed", Parser.class).getData() instanceof Map)){ // parser.data must be Map, if existed
+            return false;
+        }
+
+        if (link.containsKey("contentType")){
+            if ("text/html".equals(link.get("contentType"))) return true;
+        }
+        return false;
+    }
+
+    /**
+     * parser.data must be Map<String, Map<String, Collection<String>>>
+     */
+    public Set<T> extract(T base, Parser<T, ?> parser){
+        if (log.isTraceEnabled())
+            log.trace("extract() - base: " + base + ", parser.getClass(): " + parser.getClass());
+        Map<String, Map<String, Collection<String>>> data = (Map<String, Map<String, Collection<String>>>) parser.getData();
+        Set<T> result = null;
+        URI target;
+        URIResolver linkResolver = new URIResolver(base.getURI());
+        if (data != null){
+            for (String tag : data.keySet()){
+                if (!in(tag, SUPPORTED_TAGS)) continue;
+
+                Map<String, Collection<String>> attrs = data.get(tag);
+                for (String attr : attrs.keySet()){
+                    Collection<String> unresolvedLinks = attrs.get(attr);
+
+                    for (String unresolvedLink : unresolvedLinks){
+                        if (unresolvedLink == null) continue;
+                        if ("A".equalsIgnoreCase(tag) && JS_ANCHOR.matcher(unresolvedLink.trim()).matches())
+                            continue;
+
+                        if ("STYLE".equals(tag) || "STYLE".equals(attr)){
+                            Matcher matcher = CSS_BG.matcher(unresolvedLink);
+                            if (matcher.matches()){
+                                unresolvedLink = matcher.group(1);
+                                if (unresolvedLink == null) continue;
+                            } else{
+                                continue;
+                            }
+                        }
+
+                        if (result == null) result = new HashSet<T>();
+                        target = linkResolver.resolve(unresolvedLink);
+                        if (target != null) result.add((T) new Link(target));
+                    }
+                }
+            }
+        }
+        result = extracted(base, result);
+        return result;
+    }
+
+    private boolean in(String target, String[] set){
+        for (String item : set){
+            if (item.equals(target)) return true;
+        }
+        return false;
+    }
+
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.droids.crawler.fetcher.delay.Delay;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.FetchFilter;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+
+import javax.annotation.PostConstruct;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Comparator;
+import java.util.Collections;
+
+/**
+ * @param <T>
+ */
+public abstract class AbstractFetcher<T extends Link> implements Fetcher<T>{
+    @Autowired(required = false) protected List<FetchFilter<T>> filters;
+    @Autowired(required = false) protected List<Delay<T>> delays;
+    @Autowired(required = false) @Qualifier("fetcher.filterComparator") protected Comparator filterComparator;
+
+    /**
+     * Sorts the filters with the configured comparator, if both are present.
+     *
+     * @return this fetcher
+     */
+    @PostConstruct public Fetcher<T> init(){
+        if (filters != null && filterComparator != null) Collections.sort(filters, filterComparator);
+        return this;
+    }
+
+    /**
+     * Notifies every filter that the request for the link is ready.
+     */
+    public void requestReady(T link){
+        if (getFilters() != null) for (FetchFilter<T> filter : getFilters()) filter.requestReady(link, this);
+    }
+
+    /**
+     * Notifies every filter that the link was fetched and, unless the outcome is
+     * {@code null}, marks the link as {@link Link.State#FETCHED}.
+     *
+     * @return this fetcher when there is no filter, otherwise whatever the last filter returned
+     */
+    public Fetcher<T> fetched(T link){
+        Fetcher<T> fetcher = this;
+        // NOTE(review): every filter receives `this` and only the last filter's return
+        // value survives - confirm whether an earlier null should stop the chain.
+        if (getFilters() != null) for (FetchFilter<T> filter : getFilters()) fetcher = filter.fetched(link, this);
+        if (fetcher != null) link.setState(Link.State.FETCHED);
+        return fetcher;
+    }
+
+    /**
+     * Appends a filter, creating the filter list on first use.
+     */
+    public boolean addFilter(FetchFilter<T> filter){
+        if (filters == null) filters = new ArrayList<FetchFilter<T>>();
+        return filters.add(filter);
+    }
+
+    public boolean removeFilter(FetchFilter<T> filter){
+        return filters != null && filters.remove(filter);
+    }
+
+    public void setFilters(List<FetchFilter<T>> filters){
+        this.filters = filters;
+    }
+
+    public List<FetchFilter<T>> getFilters(){
+        return filters;
+    }
+
+    public boolean hasFilter(){
+        return filters != null && filters.size() > 0;
+    }
+
+    /**
+     * Applies the first configured delay that matches the link.
+     *
+     * @return the result of that delay's {@code delay()} call, or {@code null} when no
+     *         delay is configured or none matches
+     */
+    public Delay<T> delay(T link){
+        // Iterate only when delays ARE configured; the list is optional (required = false).
+        if (getDelays() != null){
+            for (Delay delay : getDelays()){
+                if (delay.matches(link)) return delay.delay();
+            }
+        }
+        return null;
+    }
+
+    public List<Delay<T>> getDelays(){
+        return delays;
+    }
+
+    public void setDelays(List<Delay<T>> delays){
+        this.delays = delays;
+    }
+
+
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.util.WeightComparator;
+import org.apache.droids.crawler.util.ParamUtils;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.ApplicationContext;
+import org.springframework.stereotype.Component;
+
+import javax.annotation.PostConstruct;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+@Component
+public class DefaultFetcherFactory<T extends Link> implements FetcherFactory<T>{
+    protected static Log log = LogFactory.getLog(DefaultFetcherFactory.class);
+    @Autowired protected ApplicationContext context;
+    @Autowired(required = false) protected List<Fetcher> fetchers;
+    @Autowired(required = false) @Qualifier("fetcher.comparator") protected Comparator fetcherComparator;
+
+    public DefaultFetcherFactory(){
+    }
+
+    public String toString(){
+        return super.toString() + " - fetchers: " + fetchers;// + ", httpClient: " + httpClient;
+    }
+
+    @PostConstruct
+    public void init(){
+        if (fetchers != null){
+            Comparator comparator = fetcherComparator != null ? fetcherComparator : new WeightComparator();
+            Collections.sort(fetchers, comparator);
+        }
+        if (log.isDebugEnabled()){
+            log.debug("initialized - fetchers: " + fetchers);
+            log.debug("\tfilters[0].filters: " + (fetchers != null && fetchers.size() > 0 ? fetchers.get(0).getFilters() : null));
+        }
+    }
+
+    public Fetcher<T> newFetcher(T link){
+        if (link == null) throw new IllegalArgumentException("link is null");
+        if (link.containsKey("fetcher"))
+            return (Fetcher<T>) ParamUtils.resolve(link, "fetcher", Fetcher.class, context);
+
+        for (Fetcher fetcher : fetchers){
+            if (fetcher.matches(link)){
+                String[] beanNames = context.getBeanNamesForType(fetcher.getClass());
+                if (beanNames == null || beanNames.length <= 0) return null;
+                return context.getBean(beanNames[0], Fetcher.class);
+            }
+        }
+
+        if (log.isInfoEnabled()) log.info("newFetcher() - cannot find a supported parser - link: " + link);
+        return null;
+    }
+
+    public List<Fetcher> getFetchers(){
+        return fetchers;
+    }
+
+    public void setFetchers(List<Fetcher> fetchers){
+        this.fetchers = fetchers;
+    }
+
+    public Comparator getFetcherComparator(){
+        return fetcherComparator;
+    }
+
+    public void setFetcherComparator(Comparator fetcherComparator){
+        this.fetcherComparator = fetcherComparator;
+    }
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import net.jcip.annotations.NotThreadSafe;
+import org.apache.droids.core.Entity;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.fetcher.delay.Delay;
+import org.apache.droids.crawler.filter.FetchFilter;
+import org.apache.droids.crawler.filter.FilterSupport;
+import org.apache.droids.crawler.util.LinkMatcher;
+
+import java.io.Serializable;
+import java.util.Map;
+
+/**
+ * First step of the 3-step crawling process applied to a link polled from the queue ( fetch -> parse -> extract ).
+ * <p/>
+ * Instances are created by a {@link FetcherFactory}; only one Fetcher is used for any given Link. After a fetch,
+ * the status code and the entity are prepared (if available) and can be read through the accessors below.
+ * <p/>
+ * Use a {@link FetchFilter} if it is necessary to access the HTTP request and response.
+ */
+@NotThreadSafe
+public interface Fetcher<T extends Link> extends LinkMatcher<T>, FilterSupport<FetchFilter<T>>, Serializable{
+    static final int SUCCESS = 200; // same value as the HTTP "200 OK" status code
+
+    /** Fetches the given link; the outcome is then exposed by {@link #getStatusCode()} and {@link #getEntity()}.
+     * @param params use null if there is no argument
+     */
+    Fetcher<T> fetch(T link, Map params) throws FetcherException;
+
+    int getStatusCode(); // status code prepared by the last fetch
+
+    Entity getEntity(); // entity prepared by the last fetch, if one is available
+
+    /**
+     * If a delay is configured, the fetcher shall apply it before fetching.
+     */
+    Delay<T> getDelay();
+
+    /**
+     * A fetcher must be reset before it is re-used.
+     */
+    void reset();
+
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.droids.crawler.CrawlerException;
+
+/** Failure raised by the fetch step of the crawl; mirrors the four standard exception constructors. */
+public class FetcherException extends CrawlerException {
+    public FetcherException() { super(); }
+
+    public FetcherException(String message) {
+        super(message);
+    }
+
+    public FetcherException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+    public FetcherException(Throwable cause) {
+        super(cause);
+    }
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.droids.crawler.Link;
+
+public interface FetcherFactory<T extends Link>{ // supplies the Fetcher that will handle a given link
+    Fetcher<T> newFetcher(T link); // NOTE(review): an implementation may return null when no fetcher supports the link - confirm callers check
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.core.Entity;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.FetchFilter;
+import org.apache.droids.crawler.fetcher.http.HttpHeaderSupport;
+import org.apache.droids.crawler.fetcher.http.CrawlerHttpClient;
+import org.apache.droids.crawler.fetcher.delay.Delay;
+import org.apache.droids.crawler.util.Weighted;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpUriRequest;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.util.Map;
+
+import net.jcip.annotations.NotThreadSafe;
+
+import javax.annotation.PostConstruct;
+
+@NotThreadSafe
+public class HttpFetcher<T extends Link> extends AbstractFetcher<T> implements Fetcher<T>, HttpHeaderSupport, Weighted{
+    protected static Log log = LogFactory.getLog(HttpFetcher.class);
+    @Autowired(required = false) protected HttpClient httpClient;
+    protected HttpUriRequest request;
+    protected HttpResponse response;
+    protected Entity entity;
+    protected Delay delay;
+
+    public HttpFetcher(){
+    }
+
+    public HttpFetcher(HttpClient httpClient){
+        this.httpClient = httpClient;
+    }
+
+    @PostConstruct public HttpFetcher init(){
+        if (log.isDebugEnabled()) log.debug("init() - filters: " + filters + ", httpClient: " + httpClient);
+        return this;
+    }
+
+    public boolean matches(T link){
+        URI uri = link.getURI();
+        return "http".equalsIgnoreCase(uri.getScheme()) || "https".equalsIgnoreCase(uri.getScheme());
+    }
+
+
+    public Fetcher fetch(T link) throws FetcherException{
+        if (link.containsKey("fetch.params")) return fetch(link, (Map) link.get("fetch.params"));
+        else return fetch(link, null);
+    }
+
+    public Fetcher fetch(T link, Map params) throws FetcherException{
+        if (log.isTraceEnabled()) log.trace("to fetch - link: " + link + ", params: " + params);
+        try{
+            if (httpClient == null) httpClient = createHttpClient(link, params);
+            request = new HttpGet(link.getURI());
+            this.requestReady(link);
+            if (getDelay() != null) getDelay().delay();
+            response = httpClient.execute(request);
+            entity = new Entity(response.getEntity());
+            if (log.isDebugEnabled())
+                log.debug("fetch() - fetched - uri: " + link.getURI() + ", response: " + response);
+            return fetched(link);
+        } catch (IOException e){
+            throw new FetcherException(e);
+        }
+    }
+
+    /**
+     * Create a default HttpClient. It is recommended to inject a HttpClient instead of overriding this method.
+     */
+    protected HttpClient createHttpClient(T link, Map params){
+        if (params != null && params.containsKey("httpClient")){
+            return (HttpClient) params.get("httpClient");
+        } else{
+            return new CrawlerHttpClient();
+        }
+    }
+
+    public HttpResponse getResponse(){
+        return this.response != null ? this.response : null;
+    }
+
+    public int getStatusCode(){
+        return this.response != null ? this.response.getStatusLine().getStatusCode() : -1;
+    }
+
+    public InputStream getContent(){
+        return this.entity != null ? this.entity.getContent() : null;
+    }
+
+    public Entity getEntity(){
+        return this.entity;
+    }
+
+    public Delay<T> getDelay(){
+        return this.delay;
+    }
+
+
+    public void reset(){
+        this.entity = null;
+        this.request = null;
+        this.response = null;
+    }
+
+    public void addHttpHeader(String key, String value){
+        request.addHeader(key, value);
+    }
+
+    public void setHttpHeader(String key, String value){
+        request.setHeader(key, value);
+    }
+
+    public int getWeight(){
+        return 10;
+    }
+
+
+    public HttpClient getHttpClient(){
+        return httpClient;
+    }
+
+    public void setHttpClient(HttpClient httpClient){
+        this.httpClient = httpClient;
+    }
+
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher.appengine;
+
+import com.google.appengine.api.urlfetch.*;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.core.Entity;
+import org.apache.droids.crawler.CrawlerService;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.fetcher.AbstractFetcher;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.fetcher.FetcherException;
+import org.apache.droids.crawler.fetcher.delay.Delay;
+import org.apache.droids.crawler.fetcher.http.HttpHeaderSupport;
+import org.apache.droids.crawler.filter.FetchFilter;
+
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Single thread use Fetcher for Google App Engine.
+ * <p/>
+ * TODO: find a way to use stream fetching in GAE
+ * the current implementation uses URLFetchService that fetch data as a byte[]. Given that there is a limitation
+ * in GAE that can process data up to 1M, there is a need to look for a method to do streaming
+ */
+public class AppEngineFetcher<T extends Link> extends AbstractFetcher<T> implements Fetcher<T>, HttpHeaderSupport{
+    protected static Log log = LogFactory.getLog(AppEngineFetcher.class);
+    public static final long serialVersionUID = CrawlerService.DEFAULT_SERIALVERSIONID;
+    protected transient URLFetchService urlFetchService;
+    protected transient HTTPRequest request;
+    protected transient HTTPResponse response;
+    protected Entity entity;
+    protected static Map<String, String> requestHeaders = new HashMap<String, String>();
+    protected int statusCode;
+    protected int autoRetries = 5;//total 5 times
+    protected int retryDelay = 5000;
+
+    static{
+        requestHeaders.put("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.1) Gecko/20090715 Firefox/3.5.1 GTB5");
+        requestHeaders.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,**/*//*;q=0.8");
+        requestHeaders.put("Accept-Language", "en-us,en;q=0.5");
+        requestHeaders.put("Accept-Encoding", "gzip,deflate");
+        requestHeaders.put("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");
+        requestHeaders.put("Keep-Alive", "300");
+        requestHeaders.put("Connection", "keep-alive");
+        requestHeaders.put("Pragma", "no-cache");
+        requestHeaders.put("Cache-Control", "no-cache");
+    }
+
+    /*protected transient HTTPRequest request;
+ protected transient HTTPResponse response;*/
+    protected Delay delay;
+
+    public boolean matches(T link){
+        if (link == null) return false;
+        URI uri = link.getURI();
+        return "http".equalsIgnoreCase(uri.getScheme()) || "https".equalsIgnoreCase(uri.getScheme());
+    }
+
+    public Fetcher fetch(T link) throws FetcherException{
+        if (link == null) return null;
+        if (link.containsKey("fetch.params")) return fetch(link, (Map) link.get("fetch.params"));
+        else return fetch(link, null);
+    }
+
+    public Fetcher fetch(T link, Map params) throws FetcherException{
+        if (link == null) return null;
+
+        if (urlFetchService == null) urlFetchService = URLFetchServiceFactory.getURLFetchService();
+
+        for (int i = 0; i <= autoRetries; i++){
+            if (log.isTraceEnabled()) log.trace("fetch() - link: " + link + ", params: " + params + ", i: " + i);
+            try{
+                URL url = link.getURI().toURL();
+                request = new HTTPRequest(url);
+                request.getFetchOptions().followRedirects().allowTruncate();
+                for (String header : requestHeaders.keySet()){
+                    request.addHeader(new HTTPHeader(header, requestHeaders.get(header)));
+                }
+                if (getFilters() != null) for (FetchFilter filter : getFilters()) filter.requestReady(link, this);
+
+                if (getDelay() != null) getDelay().delay();
+                response = urlFetchService.fetch(request);
+
+                if (log.isTraceEnabled())
+                    log.trace("fetched() - fetched, response.responseCode: " + response.getResponseCode() + ", response: " + response);
+
+                this.statusCode = response.getResponseCode();
+                this.entity = new Entity(response.getContent());
+
+                if (log.isTraceEnabled()){
+                    log.trace("\turl: " + url + ", headers.size(): " + response.getHeaders().size());
+                }
+
+                for (HTTPHeader header : response.getHeaders()){
+                    if (log.isTraceEnabled()){
+                        log.trace("\t\tname: " + header.getName() + ", value: " + header.getValue());
+                    }
+
+                    if (header.getName().equals("Content-Type")){
+                        this.entity.setContentType(header.getValue());
+                    }
+                }
+
+                return fetched(link);
+            } catch (Exception e){
+                log.error("error in fetch() - e.message: " + e.getMessage() + ", i (retry index): " + i + ", link: " + link + ", request: " + request + ", request.headers: " + (request != null ? request.getHeaders() : "null"));
+                if (i == autoRetries){throw new FetcherException(e);} else{
+                    try{
+                        if (log.isTraceEnabled()) log.trace("to sleep and retry - retryDelay: " + retryDelay + "ms");
+                        Thread.sleep(retryDelay);
+                    } catch (InterruptedException e1){
+                    }
+                }
+            }
+        }
+        return null;
+        /*
+if (urlFetchService == null) urlFetchService = URLFetchServiceFactory.getURLFetchService();
+try{
+    request = new HTTPRequest(link.getURI().toURL()); //TODO add default parameter
+    if (getFilters() != null) for (FetchFilter filter : getFilters()) filter.requestReady(link, this);
+
+    if (getDelay() != null) getDelay().delay();
+    response = urlFetchService.fetch(request);
+    entity = new Entity(response.getContent());
+
+    for (HTTPHeader header : response.getHeaders()){
+        if ("Content-Type".equalsIgnoreCase(header.getName())){
+            String contentTypeHeader = header.getValue();
+            if (contentTypeHeader.indexOf(';') != -1){
+                entity.setCharset(contentTypeHeader.substring(contentTypeHeader.indexOf(';'), contentTypeHeader.length()).trim());
+                entity.setContentType(contentTypeHeader.substring(0, contentTypeHeader.indexOf(';')));
+            } else{
+                entity.setContentType(contentTypeHeader);
+            }
+        }
+        *//*else if ("Content-Length".equalsIgnoreCase(header.getName())){
+                    entity.setContentLength(Long.parseLong(header.getValue().trim()));
+                }*//*
+            }
+
+            if (getFilters() != null) for (FetchFilter filter : getFilters()) filter.fetched(link, this);
+            if (log.isTraceEnabled())
+                log.trace("fetched - entity: " + entity + ", link: " + link + ", filters: " + getFilters());
+            return this;
+        } catch (IOException e){
+            log.error("fetch() - error - link: " + link + ", params: " + params + ", exception: " + e);
+            throw new FetcherException(e);
+        }*/
+    }
+
+    public int getStatusCode(){
+        return this.statusCode;
+    }
+
+
+    public void reset(){
+        this.entity = null;
+        /*this.request = null;
+        this.response = null;*/
+    }
+
+    public void addHttpHeader(String key, String value){
+        //request.addHeader(new HTTPHeader(key, value));
+    }
+
+    public void setHttpHeader(String key, String value){
+        //request.setHeader(new HTTPHeader(key, value));
+    }
+
+    public InputStream getContent() throws FetcherException{
+        return entity.getContent();
+    }
+
+    /*public int getStatusCode(){
+        return response.getResponseCode();
+    }*/
+
+    public Entity getEntity(){
+        return entity;
+    }
+
+    public Delay<T> getDelay(){
+        return this.delay;
+    }
+
+    /*public URLFetchService getUrlFetchService(){
+        return urlFetchService;
+    }
+
+    public void setUrlFetchService(URLFetchService urlFetchService){
+        this.urlFetchService = urlFetchService;
+    }
+
+    public HTTPRequest getRequest(){
+        return request;
+    }
+
+    public void setRequest(HTTPRequest request){
+        this.request = request;
+    }
+
+    public HTTPResponse getResponse(){
+        return response;
+    }
+
+    public void setResponse(HTTPResponse response){
+        this.response = response;
+    }*/
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher.delay;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.util.LinkMatcher;
+
+public interface Delay<T extends Link> extends LinkMatcher<T>{
+
+    /**
+     * Delays the current thread. NOTE(review): the returned Delay is presumably this instance, for chaining - confirm.
+     */
+    public Delay<T> delay();
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher.http;
+
+import org.apache.http.HttpVersion;
+import org.apache.http.HttpResponseInterceptor;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpException;
+import org.apache.http.entity.BufferedHttpEntity;
+import org.apache.http.conn.ClientConnectionManager;
+import org.apache.http.conn.params.ConnManagerParams;
+import org.apache.http.conn.params.ConnPerRouteBean;
+import org.apache.http.conn.scheme.PlainSocketFactory;
+import org.apache.http.conn.scheme.Scheme;
+import org.apache.http.conn.scheme.SchemeRegistry;
+import org.apache.http.conn.ssl.SSLSocketFactory;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
+import org.apache.http.params.BasicHttpParams;
+import org.apache.http.params.HttpConnectionParams;
+import org.apache.http.params.HttpParams;
+import org.apache.http.params.HttpProtocolParams;
+import org.apache.http.protocol.HTTP;
+import org.apache.http.protocol.HttpContext;
+
+import javax.annotation.PostConstruct;
+import java.io.IOException;
+
+/** DefaultHttpClient set up for crawling: thread-safe pooled connections, a fixed user agent and optional response buffering. */
+public class CrawlerHttpClient extends DefaultHttpClient{
+    protected int connectionTimeout = 60000; // milliseconds
+    protected String userAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.1) Gecko/20090715 Firefox/3.0.10";
+    //Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.9.1.1) Gecko/20090715 Firefox/3.5.1
+    protected int maxTotalConnections = 2;
+    protected int maxConnectionsPerRoute = 2;
+    protected boolean useBuffer = true; // when true, init() wraps every response entity in a BufferedHttpEntity
+
+    public CrawlerHttpClient(int conn){ // conn becomes the maximum total number of connections
+        this.maxTotalConnections = conn;
+    }
+
+    public CrawlerHttpClient(ClientConnectionManager clientConnectionManager, HttpParams httpParams){
+        super(clientConnectionManager, httpParams);
+    }
+
+    public CrawlerHttpClient(HttpParams httpParams){
+        super(httpParams);
+    }
+
+    public CrawlerHttpClient(){}
+
+    /** Registers the buffering response interceptor when useBuffer is set; responses without an entity are left as they are. */
+    @PostConstruct
+    public CrawlerHttpClient init(){
+        if (useBuffer){
+            this.addResponseInterceptor(new HttpResponseInterceptor(){
+                public void process(HttpResponse httpResponse, HttpContext httpContext) throws HttpException, IOException{
+                    if (httpResponse.getEntity() == null) return; // e.g. 204/304: nothing to buffer, and BufferedHttpEntity rejects null
+                    httpResponse.setEntity(new BufferedHttpEntity(httpResponse.getEntity()));
+                }
+            });
+        }
+        return this;
+    }
+
+    @Override public String toString(){
+        return super.toString() + " - maxTotalConnections: " + maxTotalConnections +
+                ", maxConnectionsPerRoute: " + maxConnectionsPerRoute + ", connectionTimeout: " + connectionTimeout +
+                ", userAgent: " + userAgent;
+    }
+
+    @Override protected HttpParams createHttpParams(){ // builds the params from the fields of this client
+        HttpParams params = new BasicHttpParams();
+        ConnManagerParams.setMaxTotalConnections(params, maxTotalConnections);
+        HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
+        HttpProtocolParams.setUserAgent(params, userAgent);
+        HttpProtocolParams.setContentCharset(params, HTTP.DEFAULT_CONTENT_CHARSET);
+        HttpProtocolParams.setUseExpectContinue(params, true);
+        HttpConnectionParams.setConnectionTimeout(params, connectionTimeout);
+        HttpConnectionParams.setTcpNoDelay(params, false);
+        HttpConnectionParams.setStaleCheckingEnabled(params, false);
+        HttpConnectionParams.setSocketBufferSize(params, 8192);
+        ConnManagerParams.setMaxConnectionsPerRoute(params, new ConnPerRouteBean(maxConnectionsPerRoute));
+        ConnManagerParams.setTimeout(params, 300000L); // 5 minutes, in ms (was written as 0x493e0L)
+        return params;
+    }
+
+    @Override protected ClientConnectionManager createClientConnectionManager(){ // http on port 80, https on port 443
+        SchemeRegistry schemeRegistry = new SchemeRegistry();
+        schemeRegistry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
+        schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
+        return new ThreadSafeClientConnManager(getParams(), schemeRegistry);
+    }
+
+    public int getConnectionTimeout(){
+        return connectionTimeout;
+    }
+
+    public void setConnectionTimeout(int connectionTimeout){ // NOTE(review): presumably only effective before the params are first built - confirm
+        this.connectionTimeout = connectionTimeout;
+    }
+
+    public String getUserAgent(){
+        return userAgent;
+    }
+
+    public void setUserAgent(String userAgent){ // NOTE(review): same caveat as setConnectionTimeout - confirm
+        this.userAgent = userAgent;
+    }
+
+    public int getMaxTotalConnections(){
+        return maxTotalConnections;
+    }
+
+    public void setMaxTotalConnections(int maxTotalConnections){ // NOTE(review): same caveat as setConnectionTimeout - confirm
+        this.maxTotalConnections = maxTotalConnections;
+    }
+
+    public int getMaxConnectionsPerRoute(){
+        return maxConnectionsPerRoute;
+    }
+
+    public void setMaxConnectionsPerRoute(int maxConnectionsPerRoute){ // NOTE(review): same caveat as setConnectionTimeout - confirm
+        this.maxConnectionsPerRoute = maxConnectionsPerRoute;
+    }
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher.http;
+
+public interface HttpHeaderSupport {
+
+    /**
+     * Adds a request header. Notice: it only works in the pre-fetch event of a FetchFilter, i.e. before the request is sent.
+     */
+    void addHttpHeader(String key, String value);
+
+    /**
+     * Sets a request header. Notice: it only works in the pre-fetch event of a FetchFilter, i.e. before the request is sent.
+     */
+    void setHttpHeader(String key, String value);
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.LinkFilter;
+import org.apache.droids.crawler.extractor.Extractor;
+import org.apache.droids.crawler.extractor.ExtractFilter;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.parser.Parser;
+
+import java.util.Set;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Filter that tags links with a crawl depth and drops extracted links that lie deeper than a maximum.
+ * <p>
+ * The depth is kept on the link itself under the {@link #DEPTH_KEY} attribute: a polled link without a depth gets
+ * depth 0, and an extracted link without a depth gets the depth of its base link plus one.
+ */
+public class DepthFilter<T extends Link> implements LinkFilter<T>, ExtractFilter<T>{
+    protected static Log log = LogFactory.getLog(DepthFilter.class);
+
+    /** Maximum depth used by the no-argument constructor. */
+    public static final int DEFAULT_DEPTH = 0;
+
+    /** Name of the link attribute that holds the depth. */
+    public static final String DEPTH_KEY = "depth";
+
+    /** Extracted links with a depth greater than this value are rejected. */
+    protected int maxDepth;
+
+    /** Running total of links rejected by extracted(). */
+    protected AtomicInteger removeCounter = new AtomicInteger();
+
+    /** Creates a filter with {@link #DEFAULT_DEPTH} as maximum depth. */
+    public DepthFilter(){
+        this(DEFAULT_DEPTH);
+    }
+
+    /**
+     * @param maxDepth greatest depth an extracted link may have without being rejected
+     */
+    public DepthFilter(int maxDepth){
+        this.maxDepth = maxDepth;
+    }
+
+    public String toString(){
+        return super.toString() + " - maxDepth: " + maxDepth;
+    }
+
+    /**
+     * Adds the depth attribute with value 0 to a polled link that does not carry one yet.
+     *
+     * @param link the polled link
+     * @return the same link instance
+     */
+    public T polled(T link){
+        if (!link.containsKey(DEPTH_KEY)){
+            link.put(DEPTH_KEY, 0);
+            if (log.isTraceEnabled()){
+                log.trace("polled() - added depth - link: " + link);
+            }
+        }
+        return link;
+    }
+
+    /**
+     * Gives every extracted link that has no depth yet the depth of base plus one (or 1 when base has no depth),
+     * then removes all links deeper than maxDepth from the given set.
+     *
+     * @param base the link the given links were extracted from
+     * @param extractor not used by this filter
+     * @param links extracted links, modified in place; may be null
+     * @return the given set without the rejected links, or null when links is null
+     */
+    public Set<T> extracted(T base, Extractor<T, ? extends Parser> extractor, Set<T> links){
+        if (links == null){
+            return null;
+        }
+        if (log.isTraceEnabled()){
+            log.trace("maxDepth: " + maxDepth + ", base: " + base + ", links.size(): " + links.size());
+        }
+        Set<T> removeList = new HashSet<T>();
+        for (T link : links){
+            if (!link.containsKey(DEPTH_KEY)){
+                int depth = base.containsKey(DEPTH_KEY) ? (Integer) base.get(DEPTH_KEY) + 1 : 1;
+                link.put(DEPTH_KEY, depth);
+            }
+
+            if ((Integer) link.get(DEPTH_KEY) > maxDepth){
+                if (log.isTraceEnabled()){
+                    log.trace("rejecting - maxDepth: " + maxDepth + ", link: " + link);
+                }
+                removeList.add(link);
+            }
+        }
+
+        if (!removeList.isEmpty()){
+            removeCounter.addAndGet(removeList.size());
+            int linkSize = links.size();
+            links.removeAll(removeList);
+            if (log.isDebugEnabled()){
+                log.debug("rejected " + removeList.size() + " link(s) - links.size(): " + linkSize + " -> " + links.size());
+            }
+        }
+
+        return links;
+    }
+
+    /**
+     * Sanity check over the given links: reports every link that still has no depth attribute. The check only runs
+     * while debug logging is enabled, although each finding is logged at WARN level. The links themselves are never
+     * changed.
+     *
+     * @param base not used by this filter
+     * @param extractor not used by this filter
+     * @param links the extracted links; may be null
+     * @return the given set, unchanged
+     */
+    public Set<T> extractedAll(T base, List<Extractor<T, ? extends Parser>> extractor, Set<T> links){
+        if (links == null){
+            // extracted() hands null straight through, so accept it here too instead of failing in the loop
+            return null;
+        }
+        if (log.isDebugEnabled()){
+            for (T link : links){
+                if (!link.containsKey(DEPTH_KEY)){
+                    log.warn("extractedAll() - extracted link without '" + DEPTH_KEY + "' attribute - link: " + link);
+                }
+            }
+        }
+        return links;
+    }
+
+    /** No-op; this filter does nothing when a link is completed. */
+    public void completed(T link, Set<T> links){ }
+
+    /** No-op; this filter does nothing when a link has failed. */
+    public void failed(T link, Object object){ }
+
+    public int getMaxDepth(){
+        return maxDepth;
+    }
+
+    public void setMaxDepth(int maxDepth){
+        this.maxDepth = maxDepth;
+    }
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.filter.Filter;
+
+/**
+ * Implementations are expected to cast the fetcher to its implementation type and access it for processing,
+ * e.g. read the fetcher's HTTP request or response data, and set any data on the Link.
+ */
+public interface FetchFilter<T extends Link> extends Filter<T>{
+
+    /**
+     * Called after a request is prepared and before the content is fetched, and also before the delay is applied.
+     */
+    void requestReady(T link, Fetcher<T> fetcher);
+
+    /**
+     * Filters the Link and the result processed by a fetcher. An implementation may, for example, wrap the Fetcher in
+     * one that provides a different getContent(), so the parser may process the fetched content differently.
+     */
+    Fetcher<T> fetched(T link, Fetcher<T> fetcher);
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import java.io.Serializable;
+
+import org.apache.droids.crawler.Link;
+
+/**
+ * This is the root of all filters; it declares no methods and extends Serializable.
+ */
+public interface Filter<T extends Link> extends Serializable{
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import java.util.List;
+
+public interface FilterSupport<T> {
+
+    boolean addFilter(T filter);
+
+    boolean removeFilter(T filter);
+
+    /**
+     * Sets the filters. Filters are applied in order (presumably the order of the given list - TODO confirm).
+     */
+    void setFilters(List<T> filters);
+
+    List<T> getFilters();
+
+    boolean hasFilter();
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.extractor.Extractor;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.parser.Parser;
+
+import java.util.List;
+import java.util.Set;
+
+/**
+ * A LinkFilter, a.k.a. CrawlerFilter, may be used in several ways:
+ * <ul>
+ * <li>as a classifier that appends custom information to a Link; the information is passed along the whole
+ * processing chain and is replicated in a cluster environment</li>
+ * <li>to stop the chain of processing: if any filter returns null, the Crawler/Droid/TaskMaster shall stop further
+ * processing of the Link</li>
+ * <li>to alter data during processing, e.g. in fetched() the fetcher stores the fetched HttpEntity and a filter
+ * could alter that HttpEntity (NOTE(review): the original sentence was left unfinished here - confirm intent)</li>
+ * </ul>
+ */
+public interface LinkFilter<T extends Link> extends Filter<T>{
+    // TODO: consider changing these constants to an enum
+    int POLLED = 1;
+    int FETCHED = 2;
+    int PARSED = 3;
+    int EXTRACTED = 4;
+    int COMPLETED = 5;
+    int FAILED = -1;
+
+    /**
+     * Filters a Link taken from the master queue. The crawler/droid/taskmaster shall honor the filters and use the
+     * returned link for further processing, feeding it to the next filter.
+     *
+     * @return the link to continue with; per the interface description, null stops further processing of the link
+     */
+    T polled(T link);
+
+    /*
+     * Not declared on this interface (see FetchFilter.fetched() and ParseFilter.parsed() for the declared forms):
+     *
+     * public Fetcher<T> fetched(T link, Fetcher<T> fetcher);
+     * public Parser<T> parsed(T link, Parser<T> parser);
+     * public Set<T> extracted(T link, Extractor<T> extractor, Set<T> links);
+     * public Set<T> extractedAll(T link, Set<T> links);
+     */
+
+    /**
+     * Called when processing of a link is completed. Any thread-local resource could be released.
+     */
+    void completed(T link, Set<T> links);
+
+    /**
+     * Called when processing of a link has failed.
+     *
+     * @param link the link concerned
+     * @param object NOTE(review): not specified by the original - presumably the cause of the failure; confirm
+     */
+    void failed(T link, Object object);
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,16 @@
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.parser.Parser;
+
+public interface ParseFilter<T extends Link, D> extends Filter<T>{
+
+    /**
+     * This provides an interception point right before the parse() operation. The Parser and any underlying parsers
+     * or supporting utilities shall all be ready. Implementations should cast the parser to its implementation type
+     * and access its internal variables for operation.
+     */
+    void parserReady(T link, Parser<T, D> parser);
+
+    Parser<T, D> parsed(T link, Parser<T, D> parser);
+}

Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java Tue Sep  1 22:11:29 2009
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.parser.Parser;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.util.Weighted;
+import org.apache.droids.crawler.extractor.ExtractFilter;
+import org.apache.droids.crawler.extractor.Extractor;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.Set;
+import java.util.List;
+
+/**
+ * Filter that records how far a link has progressed by setting its Link.State at each stage: POLLED in polled(),
+ * FETCHED in fetched(), PARSED in parsed(), EXTRACTED in extractedAll() and COMPLETED in completed().
+ */
+public class StateFilter<T extends Link> implements LinkFilter<T>, FetchFilter<T>, ParseFilter<T, Object>, ExtractFilter<T>, Weighted{
+    protected static Log log = LogFactory.getLog(StateFilter.class);
+
+    /**
+     * @return the fixed weight 1000 (its meaning is defined by Weighted, which is not shown here)
+     */
+    public int getWeight(){
+        return 1000;
+    }
+
+    /** Sets the link's state to POLLED and returns the same link. */
+    public T polled(T link){
+        link.setState(Link.State.POLLED);
+        if (log.isDebugEnabled()){
+            log.debug("polled() - set link to POLLED - link: " + link);
+        }
+        return link;
+    }
+
+    /** No-op; the state is not changed before the fetch. */
+    public void requestReady(T link, Fetcher<T> fetcher){
+    }
+
+    /** Sets the link's state to FETCHED and returns the given fetcher unchanged. */
+    public Fetcher<T> fetched(T link, Fetcher<T> fetcher){
+        link.setState(Link.State.FETCHED);
+        return fetcher;
+    }
+
+    /** No-op; the state is not changed before parsing. */
+    public void parserReady(T link, Parser<T, Object> parser){
+    }
+
+    /** Sets the link's state to PARSED and returns the given parser unchanged. */
+    public Parser<T, Object> parsed(T link, Parser<T, Object> parser){
+        link.setState(Link.State.PARSED);
+        if (log.isDebugEnabled()){
+            log.debug("parsed() - set link to PARSED - link: " + link);
+        }
+        return parser;
+    }
+
+    /** Returns the given links unchanged; the state is only updated in extractedAll(). */
+    public Set<T> extracted(T base, Extractor<T, ? extends Parser> tExtractor, Set<T> links){
+        return links;
+    }
+
+    /** Sets the state of the base link to EXTRACTED and returns the given links unchanged. */
+    public Set<T> extractedAll(T base, List<Extractor<T, ? extends Parser>> extractors, Set<T> links){
+        base.setState(Link.State.EXTRACTED);
+        return links;
+    }
+
+    /** Sets the link's state to COMPLETED. */
+    public void completed(T link, Set<T> links){
+        link.setState(Link.State.COMPLETED);
+    }
+
+    /**
+     * No-op: no state is set when a link fails. NOTE(review): the original body was an IDE-generated stub - confirm
+     * whether a failure state should be recorded here.
+     */
+    public void failed(T link, Object object){
+    }
+}