You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by mi...@apache.org on 2009/09/01 22:11:33 UTC
svn commit: r810273 [2/4] - in /incubator/droids/trunk/droids-crawler: ./
docs/ docs/diagrams/ src/ src/main/ src/main/groovy/ src/main/java/
src/main/java/org/ src/main/java/org/apache/
src/main/java/org/apache/droids/ src/main/java/org/apache/droids/...
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.core.Worker;
+import org.apache.droids.crawler.util.CrawlerExecutorService;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.ApplicationContext;
+
+import java.util.Queue;
+
+/**
+ * Crawler controller that runs its own scheduling loop instead of relying on droids-core.
+ * <p/>
+ * {@link #start()} blocks the calling thread: while the queue is not empty and the executor has
+ * fewer active tasks than its maximum pool size, a "worker" bean is fetched from the application
+ * context and submitted. The loop ends once no task is active and at least {@link #getTimeout()}
+ * milliseconds have passed since the executor last completed a task.
+ *
+ * @param <T> the link type handled by this controller
+ */
+public class StandaloneCrawlerController<T extends Link> extends AbstractCrawlerController<T>{
+  @Autowired ApplicationContext context;
+  static Log log = LogFactory.getLog(StandaloneCrawlerController.class);
+  protected boolean started = false;
+  @Autowired(required = false) protected CrawlerExecutorService executorService;
+  /** Pause in milliseconds between two passes of the scheduling loop. */
+  protected long tickleTime = 5000L;
+  /** Idle time in milliseconds after the last completed task before the loop stops. */
+  protected long timeout = 30 * 1000L;
+  /** Plain configuration value; nothing in this class reads it. */
+  protected int threads = 1;
+
+  /**
+   * @return true while {@link #start()} is running its loop
+   */
+  public boolean isStarted(){
+    return started;
+  }
+
+  /**
+   * Runs the scheduling loop on the calling thread until the crawl is idle for the configured timeout.
+   * At most one worker is submitted per pass; passes are separated by {@link #getTickleTime()} ms.
+   *
+   * @throws CrawlerException if no executor service has been set, or if the thread is interrupted
+   *                          while sleeping between two passes
+   */
+  public void start() throws CrawlerException{
+    // the executor is injected with required = false, so fail with a clear message instead of an NPE
+    if (executorService == null)
+      throw new CrawlerException("start() - executorService is not configured");
+
+    started = true;
+    if (log.isInfoEnabled())
+      log.info("start() - started: " + started + ", queue.size: " + queue.size() + ", queue: " + queue + ", executorService: " + executorService);
+
+    try{
+      while (started){
+        if (!queue.isEmpty() && executorService.getActiveCount() < executorService.getMaximumPoolSize()){
+          Worker<T> worker = context.getBean("worker", Worker.class);
+          if (log.isInfoEnabled())
+            log.info("to execute worker: " + worker + ", queue.isEmpty(): " + queue.isEmpty() + ", executors: " + executorService);
+          executorService.execute(worker);
+        }
+
+        long idle = System.currentTimeMillis() - executorService.getLastCompleted();
+        if (executorService.getActiveCount() == 0 && idle >= timeout){
+          //normal shutdown case that no task is running, and graceful period is passed
+          started = false;
+          if (log.isInfoEnabled())
+            log.info("to stop - no active thread and graceful period is passed - now-lastCompleted: " + idle + ", timeout: " + timeout);
+        } else{
+          if (log.isDebugEnabled())
+            log.debug("to sleep for " + tickleTime + "ms - now-lastCompleted: " + idle + ", timeout: " + timeout);
+          try{
+            Thread.sleep(tickleTime);
+          } catch (InterruptedException e){
+            // keep the interrupt visible to the caller; wrapping alone would clear it
+            Thread.currentThread().interrupt();
+            throw new CrawlerException(e.getMessage(), e);
+          }
+        }
+
+        //TODO add other condition including forceful termination
+        //TODO handle worker timeout: a worker that never finishes keeps the active count above 0 and this loop alive
+      }
+
+      if (log.isInfoEnabled()){
+        String fmt1 = "%1$-20s";
+        log.info(String.format(fmt1, "seeds: ") + seeds + ", seeds.size(): " + ((seeds != null) ? seeds.size() : 0));
+        log.info(String.format(fmt1, "queue: ") + queue + ", queue.size(): " + ((queue != null) ? queue.size() : 0));
+        log.info(String.format(fmt1, "filters: ") + filters);
+        log.info(String.format(fmt1, "crawlerService: ") + crawlerService);
+      }
+    } finally{
+      // also reset the flag when the loop is left by an exception
+      started = false;
+    }
+  }
+
+  public CrawlerExecutorService getExecutorService(){
+    return executorService;
+  }
+
+  public void setExecutorService(CrawlerExecutorService executorService){
+    this.executorService = executorService;
+  }
+
+  /**
+   * @return pause in milliseconds between two passes of the scheduling loop
+   */
+  public long getTickleTime(){
+    return tickleTime;
+  }
+
+  public void setTickleTime(long tickleTime){
+    this.tickleTime = tickleTime;
+  }
+
+  /**
+   * @return idle time in milliseconds after which {@link #start()} returns
+   */
+  public long getTimeout(){
+    return timeout;
+  }
+
+  public void setTimeout(long timeout){
+    this.timeout = timeout;
+  }
+
+  public int getThreads(){
+    return threads;
+  }
+
+  public void setThreads(int threads){
+    this.threads = threads;
+  }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.CrawlerService;
+import org.apache.droids.crawler.parser.Parser;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+
+import javax.annotation.PostConstruct;
+import java.util.*;
+
+/**
+ * Base class for extractors: holds the optional list of {@link ExtractFilter}s, sorts it once
+ * after construction when a comparator is configured, and runs the filters over each result.
+ *
+ * @param <T> link type
+ * @param <P> parser type the extractor reads from
+ */
+public abstract class AbstractExtractor<T extends Link, P extends Parser> implements Extractor<T, P>{
+  protected static Log log = LogFactory.getLog(AbstractExtractor.class);
+  public static final long serialVersionUID = CrawlerService.DEFAULT_SERIALVERSIONID;
+  @Autowired(required = false) protected List<ExtractFilter<T>> filters;
+  @Autowired(required = false) @Qualifier("extractor.filterComparator") protected Comparator filterComparator;
+
+  abstract public Set<T> extract(T base, P parser);
+
+  abstract public boolean matches(T link);
+
+  /**
+   * Sorts the filters with the configured comparator; does nothing when either is missing.
+   *
+   * @return this extractor
+   */
+  @PostConstruct Extractor<T, P> init(){
+    if (filters != null && filterComparator != null) Collections.sort(filters, filterComparator);
+    return this;
+  }
+
+  /**
+   * Passes the extraction result through every filter in order, each filter receiving the set
+   * returned by the previous one, then marks the link as {@code EXTRACTED}.
+   *
+   * @param link   the link that was extracted from
+   * @param result the out-links found so far; handed to the filters as is, so it may be null
+   * @return the set returned by the last filter, or {@code result} unchanged when there is no filter
+   */
+  protected Set<T> extracted(T link, Set<T> result){
+    if (hasFilter()){
+      for (ExtractFilter<T> filter : getFilters()){
+        result = filter.extracted(link, this, result);
+        if (log.isTraceEnabled())
+          log.trace("filtered - filter: " + filter + ", result.size(): " + (result != null ? result.size() : 0));
+      }
+    }
+    link.setState(Link.State.EXTRACTED);
+    return result;
+  }
+
+  /**
+   * Appends a filter, creating the list on first use.
+   */
+  public boolean addFilter(ExtractFilter<T> filter){
+    if (filters == null) filters = new ArrayList<ExtractFilter<T>>();
+    return filters.add(filter);
+  }
+
+  /**
+   * Removes a filter.
+   *
+   * @return true if the filter was registered; false otherwise, including when no filter list exists yet
+   */
+  public boolean removeFilter(ExtractFilter<T> filter){
+    // filters is optional (autowired with required = false), so it may still be null here
+    return filters != null && filters.remove(filter);
+  }
+
+  public void setFilters(List<ExtractFilter<T>> filters){
+    this.filters = filters;
+  }
+
+  public List<ExtractFilter<T>> getFilters(){
+    return filters;
+  }
+
+  /**
+   * @return true when at least one filter is registered
+   */
+  public boolean hasFilter(){
+    return filters != null && filters.size() > 0;
+  }
+
+  public Comparator getFilterComparator(){
+    return filterComparator;
+  }
+
+  public void setFilterComparator(Comparator filterComparator){
+    this.filterComparator = filterComparator;
+  }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.Filter;
+import org.apache.droids.crawler.parser.Parser;
+
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Filter hooked into the extract step; it can inspect or replace the set of extracted out-links.
+ *
+ * @param <T> link type
+ */
+public interface ExtractFilter<T extends Link> extends Filter<T>{
+
+ /**
+ * Called after a single extractor has extracted the out-links of a link.
+ * AbstractExtractor calls this once per filter and continues with the returned set.
+ *
+ * @param base      the link the out-links were extracted from
+ * @param extractor the extractor that produced the links
+ * @param links     the out-links so far; may be null (HtmlElementLinkExtractor passes null when it found nothing)
+ * @return the set of out-links to continue with
+ */
+ Set<T> extracted(T base, Extractor<T, ? extends Parser> extractor, Set<T> links);
+
+ /**
+ * Called after all out-links are extracted by all extractors.
+ * NOTE(review): no caller is visible in this part of the commit; presumably invoked with the
+ * aggregated result of every matching extractor - confirm against the caller.
+ *
+ * @param base       the link the out-links were extracted from
+ * @param extractors the extractors that were applied
+ * @param links      the out-links
+ * @return the set of out-links to continue with
+ */
+ Set<T> extractedAll(T base, List<Extractor<T, ? extends Parser>> extractors, Set<T> links);
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.FilterSupport;
+import org.apache.droids.crawler.util.LinkMatcher;
+import org.apache.droids.crawler.parser.Parser;
+
+import java.io.Serializable;
+import java.util.Set;
+
+/**
+ * Third of the 3-step crawling process after a link is polled from the Queue ( fetch -> parse -> extract )
+ * <p/>
+ * An Extractor extracts data from a Parser. Its primary purpose is to extract out-links, and it is also
+ * designed for extracting any user data to store in a Link. By design, an extractor has to be associated
+ * with a parser.
+ * <p/>
+ * Every link will be fetched and parsed once but may be extracted multiple times, by any number of matching
+ * extractors. The results will be aggregated.
+ *
+ * @param <T> link type
+ * @param <P> parser type the extractor reads its data from
+ */
+public interface Extractor<T extends Link, P extends Parser> extends LinkMatcher<T>, FilterSupport<ExtractFilter<T>>, Serializable{
+
+ /**
+ * Extracts the out-links of a parsed link.
+ *
+ * @param base   the link that was fetched and parsed
+ * @param parser the parser holding the parsed data of {@code base}
+ * @return the extracted out-links; may be null - TODO confirm whether callers expect an empty set instead
+ */
+ Set<T> extract(T base, P parser);
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.droids.crawler.CrawlerException;
+
+/**
+ * {@link CrawlerException} subtype declared in the extractor package. It carries no state of its
+ * own; every constructor forwards its arguments to the matching superclass constructor.
+ */
+public class ExtractorException extends CrawlerException{
+
+  /** Creates an exception without message or cause. */
+  public ExtractorException(){
+    super();
+  }
+
+  /**
+   * @param message detail message
+   */
+  public ExtractorException(String message){
+    super(message);
+  }
+
+  /**
+   * @param message detail message
+   * @param cause   underlying failure
+   */
+  public ExtractorException(String message, Throwable cause){
+    super(message, cause);
+  }
+
+  /**
+   * @param cause underlying failure
+   */
+  public ExtractorException(Throwable cause){
+    super(cause);
+  }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.extractor;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.util.URIResolver;
+import org.apache.droids.crawler.parser.Parser;
+import org.apache.droids.crawler.parser.impl.NekoHtmlParser;
+import org.apache.droids.crawler.parser.impl.AbstractHierarchicalDataParser;
+import org.apache.droids.crawler.extractor.Extractor;
+
+import java.net.URI;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Collection;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
+
+/**
+ * This is a generic HTML link extractor that expects the parser returns data in a hierarchy as follows:
+ * [ ${tagName} : [ $attributeName: [ ${relativeLink1}, ${relativeLink2} ] as Set ]
+ * <p/>
+ * And it output a Set<Link>
+ *
+ * TODO: include DROIDS-59 from LinkExtractor
+ */
+public class HtmlElementLinkExtractor<T extends Link> extends AbstractExtractor<T, Parser<T, ?>> implements Extractor<T, Parser<T, ?>>{
+ protected static Log log = LogFactory.getLog(HtmlElementLinkExtractor.class);
+ protected static String[] SUPPORTED_TAGS = new String[]{"A", "LINK", "IMG", "SCRIPT", "AREA", "STYLE"};
+
+ // reference: http://java.sun.com/javase/6/docs/api/java/util/regex/Pattern.html
+ static final Pattern JS_ANCHOR = Pattern.compile("^JAVASCRIPT.*", Pattern.CASE_INSENSITIVE);
+ static final Pattern CSS_BG = Pattern.compile("^.*background-image\\s*:\\s*url\\(\\s*['\"]?([^'\"\\)\\s]*)['\"]?\\s*\\).*", Pattern.CASE_INSENSITIVE);
+ //static final Pattern CSS_BG = Pattern.compile("background-image\\s*:\\s*url\\s*\\(\\s*['\"]?(.*)['\"]?\\s*\\)", Pattern.CASE_INSENSITIVE);
+
+
+ public boolean matches(T link){
+ if (link.containsKey("parsed") && !(link.get("parsed", Parser.class).getData() instanceof Map)){ // parser.data must be Map, if existed
+ return false;
+ }
+
+ if (link.containsKey("contentType")){
+ if ("text/html".equals(link.get("contentType"))) return true;
+ }
+ return false;
+ }
+
+ /**
+ * parser.data must be Map<String, Map<String, Collection<String>>>
+ */
+ public Set<T> extract(T base, Parser<T, ?> parser){
+ if (log.isTraceEnabled())
+ log.trace("extract() - base: " + base + ", parser.getClass(): " + parser.getClass());
+ Map<String, Map<String, Collection<String>>> data = (Map<String, Map<String, Collection<String>>>) parser.getData();
+ Set<T> result = null;
+ URI target;
+ URIResolver linkResolver = new URIResolver(base.getURI());
+ if (data != null){
+ for (String tag : data.keySet()){
+ if (!in(tag, SUPPORTED_TAGS)) continue;
+
+ Map<String, Collection<String>> attrs = data.get(tag);
+ for (String attr : attrs.keySet()){
+ Collection<String> unresolvedLinks = attrs.get(attr);
+
+ for (String unresolvedLink : unresolvedLinks){
+ if (unresolvedLink == null) continue;
+ if ("A".equalsIgnoreCase(tag) && JS_ANCHOR.matcher(unresolvedLink.trim()).matches())
+ continue;
+
+ if ("STYLE".equals(tag) || "STYLE".equals(attr)){
+ Matcher matcher = CSS_BG.matcher(unresolvedLink);
+ if (matcher.matches()){
+ unresolvedLink = matcher.group(1);
+ if (unresolvedLink == null) continue;
+ } else{
+ continue;
+ }
+ }
+
+ if (result == null) result = new HashSet<T>();
+ target = linkResolver.resolve(unresolvedLink);
+ if (target != null) result.add((T) new Link(target));
+ }
+ }
+ }
+ }
+ result = extracted(base, result);
+ return result;
+ }
+
+ private boolean in(String target, String[] set){
+ for (String item : set){
+ if (item.equals(target)) return true;
+ }
+ return false;
+ }
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.droids.crawler.fetcher.delay.Delay;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.FetchFilter;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+
+import javax.annotation.PostConstruct;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Comparator;
+import java.util.Collections;
+
+/**
+ * Base class for fetchers: holds the optional fetch filters and delays and offers the
+ * callbacks that run the filters before and after a fetch.
+ *
+ * @param <T> link type
+ */
+public abstract class AbstractFetcher<T extends Link> implements Fetcher<T>{
+  @Autowired(required = false) protected List<FetchFilter<T>> filters;
+  @Autowired(required = false) protected List<Delay<T>> delays;
+  @Autowired(required = false) @Qualifier("fetcher.filterComparator") protected Comparator filterComparator;
+
+  /**
+   * Sorts the filters with the configured comparator; does nothing when either is missing.
+   *
+   * @return this fetcher
+   */
+  @PostConstruct public Fetcher<T> init(){
+    if (filters != null && filterComparator != null) Collections.sort(filters, filterComparator);
+    return this;
+  }
+
+  /**
+   * Notifies every filter that the request for the link is ready.
+   */
+  public void requestReady(T link){
+    if (getFilters() != null) for (FetchFilter<T> filter : getFilters()) filter.requestReady(link, this);
+  }
+
+  /**
+   * Notifies every filter that the link was fetched and marks the link as {@code FETCHED}
+   * unless the resulting fetcher is null.
+   *
+   * @return this fetcher when there is no filter, otherwise whatever the last filter returned
+   */
+  public Fetcher<T> fetched(T link){
+    Fetcher<T> fetcher = this;
+    if (getFilters() != null) for (FetchFilter<T> filter : getFilters()) fetcher = filter.fetched(link, this);
+    if (fetcher != null) link.setState(Link.State.FETCHED);
+    return fetcher;
+  }
+
+  /**
+   * Appends a filter, creating the list on first use.
+   */
+  public boolean addFilter(FetchFilter<T> filter){
+    if (filters == null) filters = new ArrayList<FetchFilter<T>>();
+    return filters.add(filter);
+  }
+
+  /**
+   * @return true if the filter was registered; false otherwise, including when no filter list exists
+   */
+  public boolean removeFilter(FetchFilter<T> filter){
+    return filters != null && filters.remove(filter);
+  }
+
+  public void setFilters(List<FetchFilter<T>> filters){
+    this.filters = filters;
+  }
+
+  public List<FetchFilter<T>> getFilters(){
+    return filters;
+  }
+
+  /**
+   * @return true when at least one filter is registered
+   */
+  public boolean hasFilter(){
+    return filters != null && filters.size() > 0;
+  }
+
+  /**
+   * Looks up the first configured delay that matches the link.
+   *
+   * @return the result of {@code delay()} on the first matching delay, or null when no delay is
+   *         configured or none matches
+   */
+  public Delay<T> delay(T link){
+    // only iterate when delays are configured (the check used to be inverted)
+    if (getDelays() != null){
+      for (Delay delay : getDelays()){
+        if (delay.matches(link)) return delay.delay();
+      }
+    }
+    return null;
+  }
+
+  public List<Delay<T>> getDelays(){
+    return delays;
+  }
+
+  public void setDelays(List<Delay<T>> delays){
+    this.delays = delays;
+  }
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.util.WeightComparator;
+import org.apache.droids.crawler.util.ParamUtils;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.ApplicationContext;
+import org.springframework.stereotype.Component;
+
+import javax.annotation.PostConstruct;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+@Component
+public class DefaultFetcherFactory<T extends Link> implements FetcherFactory<T>{
+ protected static Log log = LogFactory.getLog(DefaultFetcherFactory.class);
+ @Autowired protected ApplicationContext context;
+ @Autowired(required = false) protected List<Fetcher> fetchers;
+ @Autowired(required = false) @Qualifier("fetcher.comparator") protected Comparator fetcherComparator;
+
+ public DefaultFetcherFactory(){
+ }
+
+ public String toString(){
+ return super.toString() + " - fetchers: " + fetchers;// + ", httpClient: " + httpClient;
+ }
+
+ @PostConstruct
+ public void init(){
+ if (fetchers != null){
+ Comparator comparator = fetcherComparator != null ? fetcherComparator : new WeightComparator();
+ Collections.sort(fetchers, comparator);
+ }
+ if (log.isDebugEnabled()){
+ log.debug("initialized - fetchers: " + fetchers);
+ log.debug("\tfilters[0].filters: " + (fetchers != null && fetchers.size() > 0 ? fetchers.get(0).getFilters() : null));
+ }
+ }
+
+ public Fetcher<T> newFetcher(T link){
+ if (link == null) throw new IllegalArgumentException("link is null");
+ if (link.containsKey("fetcher"))
+ return (Fetcher<T>) ParamUtils.resolve(link, "fetcher", Fetcher.class, context);
+
+ for (Fetcher fetcher : fetchers){
+ if (fetcher.matches(link)){
+ String[] beanNames = context.getBeanNamesForType(fetcher.getClass());
+ if (beanNames == null || beanNames.length <= 0) return null;
+ return context.getBean(beanNames[0], Fetcher.class);
+ }
+ }
+
+ if (log.isInfoEnabled()) log.info("newFetcher() - cannot find a supported parser - link: " + link);
+ return null;
+ }
+
+ public List<Fetcher> getFetchers(){
+ return fetchers;
+ }
+
+ public void setFetchers(List<Fetcher> fetchers){
+ this.fetchers = fetchers;
+ }
+
+ public Comparator getFetcherComparator(){
+ return fetcherComparator;
+ }
+
+ public void setFetcherComparator(Comparator fetcherComparator){
+ this.fetcherComparator = fetcherComparator;
+ }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import net.jcip.annotations.NotThreadSafe;
+import org.apache.droids.core.Entity;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.fetcher.delay.Delay;
+import org.apache.droids.crawler.filter.FetchFilter;
+import org.apache.droids.crawler.filter.FilterSupport;
+import org.apache.droids.crawler.util.LinkMatcher;
+
+import java.io.Serializable;
+import java.util.Map;
+
+/**
+ * First step of the 3-step crawling process after a link is polled from the Queue ( fetch -> parse -> extract )
+ * <p/>
+ * Created by a {@link FetcherFactory}; only one Fetcher will be used for any Link. After the fetch, the status
+ * code and the entity will be prepared (if available).
+ * <p/>
+ * Use a {@link FetchFilter} if it is necessary to access the Http Request and Response.
+ *
+ * @param <T> link type
+ */
+@NotThreadSafe
+public interface Fetcher<T extends Link> extends LinkMatcher<T>, FilterSupport<FetchFilter<T>>, Serializable{
+ // status code constant; presumably mirrors HTTP 200 OK - TODO confirm for non-HTTP fetchers
+ static final int SUCCESS = 200;
+
+ /**
+ * Fetches the given link.
+ *
+ * @param link   the link to fetch
+ * @param params use null if there is no argument
+ * @return a fetcher - NOTE(review): presumably this instance, holding the status code and entity; confirm
+ * @throws FetcherException declared for fetch failures
+ */
+ Fetcher<T> fetch(T link, Map params) throws FetcherException;
+
+ /**
+ * @return the status code prepared by the fetch, if available
+ */
+ int getStatusCode();
+
+ /**
+ * @return the entity prepared by the fetch, if available
+ */
+ Entity getEntity();
+
+ /**
+ * If a delay is configured, the fetcher shall wait for it before fetching.
+ */
+ Delay<T> getDelay();
+
+ /**
+ * A fetcher must be reset before it is re-used.
+ */
+ void reset();
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.droids.crawler.CrawlerException;
+
+/**
+ * Raised when a {@link Fetcher} is unable to fetch a link.
+ * <p/>
+ * The constructors mirror those of {@link CrawlerException} and simply delegate to them.
+ */
+public class FetcherException extends CrawlerException {
+
+    /**
+     * Creates an exception without a message or a cause.
+     */
+    public FetcherException() {
+        super();
+    }
+
+    /**
+     * @param cause the underlying failure
+     */
+    public FetcherException(Throwable cause) {
+        super(cause);
+    }
+
+    /**
+     * @param message description of the failure
+     */
+    public FetcherException(String message) {
+        super(message);
+    }
+
+    /**
+     * @param message description of the failure
+     * @param cause the underlying failure
+     */
+    public FetcherException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.droids.crawler.Link;
+
+/**
+ * Supplies the {@link Fetcher} to be used for a link.
+ */
+public interface FetcherFactory<T extends Link>{
+ /**
+ * @param link the link that is about to be fetched
+ * @return a fetcher for the link
+ * NOTE(review): whether a fresh or a re-used instance is returned is up to the implementation - confirm
+ */
+ Fetcher<T> newFetcher(T link);
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.core.Entity;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.FetchFilter;
+import org.apache.droids.crawler.fetcher.http.HttpHeaderSupport;
+import org.apache.droids.crawler.fetcher.http.CrawlerHttpClient;
+import org.apache.droids.crawler.fetcher.delay.Delay;
+import org.apache.droids.crawler.util.Weighted;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpUriRequest;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.util.Map;
+
+import net.jcip.annotations.NotThreadSafe;
+
+import javax.annotation.PostConstruct;
+
+/**
+ * {@link Fetcher} for {@code http} and {@code https} links that issues a GET request through an Apache
+ * {@link HttpClient}.
+ * <p/>
+ * The client may be supplied through the constructor, the setter or Spring autowiring; when none is present at
+ * fetch time one is obtained from {@link #createHttpClient(Link, Map)}.
+ * <p/>
+ * The client, the request and the response are {@code transient}: {@link Fetcher} is serializable while these
+ * HttpClient types are not declared serializable, so they are left out of the serialized form. A deserialized
+ * fetcher obtains a client again on its next fetch.
+ */
+@NotThreadSafe
+public class HttpFetcher<T extends Link> extends AbstractFetcher<T> implements Fetcher<T>, HttpHeaderSupport, Weighted{
+    protected static Log log = LogFactory.getLog(HttpFetcher.class);
+    @Autowired(required = false) protected transient HttpClient httpClient;
+    /** Request of the fetch in progress; null before the first fetch and after {@link #reset()}. */
+    protected transient HttpUriRequest request;
+    /** Response of the last fetch; null before the first fetch and after {@link #reset()}. */
+    protected transient HttpResponse response;
+    protected Entity entity;
+    protected Delay delay;
+
+    public HttpFetcher(){
+    }
+
+    /**
+     * @param httpClient the client used to execute requests
+     */
+    public HttpFetcher(HttpClient httpClient){
+        this.httpClient = httpClient;
+    }
+
+    /**
+     * Post-construction hook; only logs the configured filters and client at debug level.
+     *
+     * @return this fetcher
+     */
+    @PostConstruct public HttpFetcher init(){
+        if (log.isDebugEnabled()) log.debug("init() - filters: " + filters + ", httpClient: " + httpClient);
+        return this;
+    }
+
+    /**
+     * @return true if the link's URI scheme is http or https (case-insensitive); false for a null link
+     */
+    public boolean matches(T link){
+        // a null link cannot match; answering false keeps this consistent with AppEngineFetcher
+        if (link == null) return false;
+        URI uri = link.getURI();
+        String scheme = uri.getScheme();
+        return "http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme);
+    }
+
+
+    /**
+     * Fetches the link, using the map stored under the link's "fetch.params" key (if any) as parameters.
+     */
+    public Fetcher fetch(T link) throws FetcherException{
+        if (link.containsKey("fetch.params")) return fetch(link, (Map) link.get("fetch.params"));
+        else return fetch(link, null);
+    }
+
+    /**
+     * Executes a GET request for the link's URI.
+     * <p/>
+     * Order of events: obtain a client if none is set, build the request, call the inherited
+     * {@code requestReady(link)}, apply the delay (if configured), execute the request, wrap the response entity
+     * and finally return the result of the inherited {@code fetched(link)}.
+     *
+     * @param link the link to fetch
+     * @param params optional parameters, may be null; only consulted when a client has to be created
+     * @throws FetcherException wrapping any IOException raised while executing the request
+     */
+    public Fetcher fetch(T link, Map params) throws FetcherException{
+        if (log.isTraceEnabled()) log.trace("to fetch - link: " + link + ", params: " + params);
+        try{
+            if (httpClient == null) httpClient = createHttpClient(link, params);
+            request = new HttpGet(link.getURI());
+            this.requestReady(link);
+            if (getDelay() != null) getDelay().delay();
+            response = httpClient.execute(request);
+            entity = new Entity(response.getEntity());
+            if (log.isDebugEnabled())
+                log.debug("fetch() - fetched - uri: " + link.getURI() + ", response: " + response);
+            return fetched(link);
+        } catch (IOException e){
+            throw new FetcherException(e);
+        }
+    }
+
+    /**
+     * Create a default HttpClient. It is recommended to inject a HttpClient instead of overriding this method.
+     *
+     * @return the client stored under the "httpClient" key of params when present, otherwise a new
+     *         {@link CrawlerHttpClient}
+     */
+    protected HttpClient createHttpClient(T link, Map params){
+        if (params != null && params.containsKey("httpClient")){
+            return (HttpClient) params.get("httpClient");
+        } else{
+            // NOTE(review): CrawlerHttpClient.init() is not invoked here, so its response buffering interceptor
+            // is not installed for clients created this way - confirm whether that is intended
+            return new CrawlerHttpClient();
+        }
+    }
+
+    /**
+     * @return the response of the last fetch, or null if there is none
+     */
+    public HttpResponse getResponse(){
+        return this.response;
+    }
+
+    /**
+     * @return the HTTP status code of the last response, or -1 if there is no response
+     */
+    public int getStatusCode(){
+        return this.response != null ? this.response.getStatusLine().getStatusCode() : -1;
+    }
+
+    /**
+     * @return the content stream of the fetched entity, or null if there is no entity
+     */
+    public InputStream getContent(){
+        return this.entity != null ? this.entity.getContent() : null;
+    }
+
+    public Entity getEntity(){
+        return this.entity;
+    }
+
+    public Delay<T> getDelay(){
+        return this.delay;
+    }
+
+
+    /**
+     * Drops the entity, request and response of the previous fetch. The client is kept.
+     */
+    public void reset(){
+        this.entity = null;
+        this.request = null;
+        this.response = null;
+    }
+
+    /**
+     * Adds a header to the request being prepared.
+     *
+     * @throws IllegalStateException if no request exists (before a fetch or after {@link #reset()})
+     */
+    public void addHttpHeader(String key, String value){
+        currentRequest().addHeader(key, value);
+    }
+
+    /**
+     * Sets (replaces) a header on the request being prepared.
+     *
+     * @throws IllegalStateException if no request exists (before a fetch or after {@link #reset()})
+     */
+    public void setHttpHeader(String key, String value){
+        currentRequest().setHeader(key, value);
+    }
+
+    /**
+     * Returns the request being prepared, failing with a descriptive error instead of a bare
+     * NullPointerException when headers are changed outside of a fetch.
+     */
+    private HttpUriRequest currentRequest(){
+        if (request == null){
+            throw new IllegalStateException("no HTTP request available - headers can only be changed after the request has been created by fetch() and before reset()");
+        }
+        return request;
+    }
+
+    /**
+     * @return the fixed weight 10
+     * NOTE(review): how the weight is interpreted is defined by users of Weighted, not visible here
+     */
+    public int getWeight(){
+        return 10;
+    }
+
+
+    public HttpClient getHttpClient(){
+        return httpClient;
+    }
+
+    public void setHttpClient(HttpClient httpClient){
+        this.httpClient = httpClient;
+    }
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher.appengine;
+
+import com.google.appengine.api.urlfetch.*;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.core.Entity;
+import org.apache.droids.crawler.CrawlerService;
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.fetcher.AbstractFetcher;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.fetcher.FetcherException;
+import org.apache.droids.crawler.fetcher.delay.Delay;
+import org.apache.droids.crawler.fetcher.http.HttpHeaderSupport;
+import org.apache.droids.crawler.filter.FetchFilter;
+
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Single thread use Fetcher for Google App Engine.
+ * <p/>
+ * A failed attempt is retried up to {@link #autoRetries} times, pausing {@link #retryDelay} milliseconds between
+ * attempts; the exception of the last attempt is reported as a {@link FetcherException}.
+ * <p/>
+ * TODO: find a way to use stream fetching in GAE
+ * the current implementation uses URLFetchService that fetch data as a byte[]. Given that there is a limitation
+ * in GAE that can process data up to 1M, there is a need to look for a method to do streaming
+ */
+public class AppEngineFetcher<T extends Link> extends AbstractFetcher<T> implements Fetcher<T>, HttpHeaderSupport{
+    protected static Log log = LogFactory.getLog(AppEngineFetcher.class);
+    public static final long serialVersionUID = CrawlerService.DEFAULT_SERIALVERSIONID;
+    protected transient URLFetchService urlFetchService;
+    protected transient HTTPRequest request;
+    protected transient HTTPResponse response;
+    protected Entity entity;
+    /** Headers added to every request; the map is static and therefore shared by all instances. */
+    protected static Map<String, String> requestHeaders = new HashMap<String, String>();
+    protected int statusCode;
+    /** Number of retries after the first attempt, i.e. at most autoRetries + 1 attempts are made. */
+    protected int autoRetries = 5;
+    /** Pause between two attempts, in milliseconds. */
+    protected int retryDelay = 5000;
+
+    static{
+        requestHeaders.put("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.1) Gecko/20090715 Firefox/3.5.1 GTB5");
+        // the catch-all media range was previously written as "**/*//*", which is not a valid media range
+        requestHeaders.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+        requestHeaders.put("Accept-Language", "en-us,en;q=0.5");
+        requestHeaders.put("Accept-Encoding", "gzip,deflate");
+        requestHeaders.put("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");
+        requestHeaders.put("Keep-Alive", "300");
+        requestHeaders.put("Connection", "keep-alive");
+        requestHeaders.put("Pragma", "no-cache");
+        requestHeaders.put("Cache-Control", "no-cache");
+    }
+
+    protected Delay delay;
+
+    /**
+     * @return true if the link's URI scheme is http or https (case-insensitive); false for a null link
+     */
+    public boolean matches(T link){
+        if (link == null) return false;
+        URI uri = link.getURI();
+        String scheme = uri.getScheme();
+        return "http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme);
+    }
+
+    /**
+     * Fetches the link, using the map stored under the link's "fetch.params" key (if any) as parameters.
+     *
+     * @return the result of the fetch, or null for a null link
+     */
+    public Fetcher fetch(T link) throws FetcherException{
+        if (link == null) return null;
+        if (link.containsKey("fetch.params")) return fetch(link, (Map) link.get("fetch.params"));
+        else return fetch(link, null);
+    }
+
+    /**
+     * Fetches the link through the App Engine URL Fetch service, retrying failed attempts.
+     *
+     * @param link the link to fetch; null yields null
+     * @param params only logged by this implementation
+     * @return the result of the inherited {@code fetched(link)}, or null for a null link
+     * @throws FetcherException if the last attempt fails, or if the thread is interrupted while waiting to retry
+     */
+    public Fetcher fetch(T link, Map params) throws FetcherException{
+        if (link == null) return null;
+
+        if (urlFetchService == null) urlFetchService = URLFetchServiceFactory.getURLFetchService();
+
+        for (int i = 0; i <= autoRetries; i++){
+            if (log.isTraceEnabled()) log.trace("fetch() - link: " + link + ", params: " + params + ", i: " + i);
+            try{
+                return fetchOnce(link);
+            } catch (Exception e){
+                // pass the exception to the logger so that the stack trace is not lost
+                log.error("error in fetch() - e.message: " + e.getMessage() + ", i (retry index): " + i + ", link: " + link + ", request: " + request + ", request.headers: " + (request != null ? request.getHeaders() : "null"), e);
+                if (i == autoRetries) throw new FetcherException(e);
+                pauseBeforeRetry(link, e);
+            }
+        }
+        // only reached when autoRetries is negative and no attempt was made
+        return null;
+    }
+
+    /**
+     * Performs a single fetch attempt: builds the request with the default headers, notifies the filters, applies
+     * the delay, executes the request and records status code, entity and content type.
+     */
+    private Fetcher fetchOnce(T link) throws Exception{
+        URL url = link.getURI().toURL();
+        request = new HTTPRequest(url);
+        request.getFetchOptions().followRedirects().allowTruncate();
+        for (Map.Entry<String, String> header : requestHeaders.entrySet()){
+            request.addHeader(new HTTPHeader(header.getKey(), header.getValue()));
+        }
+        if (getFilters() != null) for (FetchFilter filter : getFilters()) filter.requestReady(link, this);
+
+        if (getDelay() != null) getDelay().delay();
+        response = urlFetchService.fetch(request);
+
+        if (log.isTraceEnabled())
+            log.trace("fetched() - fetched, response.responseCode: " + response.getResponseCode() + ", response: " + response);
+
+        this.statusCode = response.getResponseCode();
+        this.entity = new Entity(response.getContent());
+
+        if (log.isTraceEnabled()){
+            log.trace("\turl: " + url + ", headers.size(): " + response.getHeaders().size());
+        }
+
+        for (HTTPHeader header : response.getHeaders()){
+            if (log.isTraceEnabled()){
+                log.trace("\t\tname: " + header.getName() + ", value: " + header.getValue());
+            }
+
+            // HTTP header names are case-insensitive
+            if ("Content-Type".equalsIgnoreCase(header.getName())){
+                this.entity.setContentType(header.getValue());
+            }
+        }
+
+        return fetched(link);
+    }
+
+    /**
+     * Sleeps for retryDelay milliseconds. An interrupt restores the thread's interrupt flag and aborts the fetch
+     * instead of being ignored, so that a shutdown request is not delayed by further retries.
+     */
+    private void pauseBeforeRetry(T link, Exception failure) throws FetcherException{
+        if (log.isTraceEnabled()) log.trace("to sleep and retry - retryDelay: " + retryDelay + "ms");
+        try{
+            Thread.sleep(retryDelay);
+        } catch (InterruptedException interrupted){
+            Thread.currentThread().interrupt();
+            throw new FetcherException("interrupted while waiting to retry fetch of " + link, failure);
+        }
+    }
+
+    /**
+     * @return the response code of the last successful fetch attempt, or 0 if there is none
+     */
+    public int getStatusCode(){
+        return this.statusCode;
+    }
+
+
+    /**
+     * Drops everything recorded by the previous fetch (entity, status code, request and response).
+     */
+    public void reset(){
+        this.entity = null;
+        this.statusCode = 0;
+        this.request = null;
+        this.response = null;
+    }
+
+    /**
+     * Does nothing in this implementation.
+     * NOTE(review): the call that would add the header to the request is disabled - confirm whether header
+     * changes by filters should be supported here
+     */
+    public void addHttpHeader(String key, String value){
+        //request.addHeader(new HTTPHeader(key, value));
+    }
+
+    /**
+     * Does nothing in this implementation.
+     * NOTE(review): the call that would set the header on the request is disabled - confirm whether header
+     * changes by filters should be supported here
+     */
+    public void setHttpHeader(String key, String value){
+        //request.setHeader(new HTTPHeader(key, value));
+    }
+
+    /**
+     * @return the content stream of the fetched entity, or null if there is no entity
+     */
+    public InputStream getContent() throws FetcherException{
+        return entity != null ? entity.getContent() : null;
+    }
+
+    public Entity getEntity(){
+        return entity;
+    }
+
+    public Delay<T> getDelay(){
+        return this.delay;
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher.delay;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.util.LinkMatcher;
+
+/**
+ * A pause that a fetcher applies before it fetches a link.
+ */
+public interface Delay<T extends Link> extends LinkMatcher<T>{
+
+ /**
+ * Delay the current thread.
+ *
+ * @return a Delay instance
+ * NOTE(review): presumably this instance, to allow call chaining - confirm against implementations
+ */
+ public Delay<T> delay();
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher.http;
+
+import org.apache.http.HttpVersion;
+import org.apache.http.HttpResponseInterceptor;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpException;
+import org.apache.http.entity.BufferedHttpEntity;
+import org.apache.http.conn.ClientConnectionManager;
+import org.apache.http.conn.params.ConnManagerParams;
+import org.apache.http.conn.params.ConnPerRouteBean;
+import org.apache.http.conn.scheme.PlainSocketFactory;
+import org.apache.http.conn.scheme.Scheme;
+import org.apache.http.conn.scheme.SchemeRegistry;
+import org.apache.http.conn.ssl.SSLSocketFactory;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
+import org.apache.http.params.BasicHttpParams;
+import org.apache.http.params.HttpConnectionParams;
+import org.apache.http.params.HttpParams;
+import org.apache.http.params.HttpProtocolParams;
+import org.apache.http.protocol.HTTP;
+import org.apache.http.protocol.HttpContext;
+
+import javax.annotation.PostConstruct;
+import java.io.IOException;
+
+/**
+ * {@link DefaultHttpClient} pre-configured for crawling: browser-like user agent, connection limits, a
+ * thread-safe connection manager and (optionally, see {@link #init()}) buffering of response entities.
+ * <p/>
+ * The HTTP parameters and the connection manager are built from the bean properties by
+ * {@link #createHttpParams()} and {@link #createClientConnectionManager()}, so the properties should be set
+ * before the client is first used. The constructors that take HttpParams hand them to the superclass.
+ */
+public class CrawlerHttpClient extends DefaultHttpClient{
+    /** Value passed to ConnManagerParams.setTimeout: 300000 (0x493e0), i.e. 5 minutes if interpreted as ms. */
+    private static final long CONNECTION_MANAGER_TIMEOUT = 300000L;
+
+    protected int connectionTimeout = 60000;
+    protected String userAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.1) Gecko/20090715 Firefox/3.0.10";
+    //Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.9.1.1) Gecko/20090715 Firefox/3.5.1
+    protected int maxTotalConnections = 2;
+    protected int maxConnectionsPerRoute = 2;
+    protected boolean useBuffer = true;
+
+    /**
+     * @param conn maximum total number of connections
+     */
+    public CrawlerHttpClient(int conn){
+        this.maxTotalConnections = conn;
+    }
+
+    public CrawlerHttpClient(ClientConnectionManager clientConnectionManager, HttpParams httpParams){
+        super(clientConnectionManager, httpParams);
+    }
+
+    public CrawlerHttpClient(HttpParams httpParams){
+        super(httpParams);
+    }
+
+    public CrawlerHttpClient(){
+
+    }
+
+    /**
+     * Post-construction hook. When buffering is enabled, installs a response interceptor that replaces the
+     * response entity with a {@link BufferedHttpEntity}.
+     *
+     * @return this client
+     */
+    @PostConstruct
+    public CrawlerHttpClient init(){
+        if (useBuffer){
+            this.addResponseInterceptor(new HttpResponseInterceptor(){
+                public void process(HttpResponse httpResponse, HttpContext httpContext) throws HttpException, IOException{
+                    // responses without a body carry no entity, and BufferedHttpEntity rejects a null entity
+                    if (httpResponse.getEntity() != null){
+                        httpResponse.setEntity(new BufferedHttpEntity(httpResponse.getEntity()));
+                    }
+                }
+            });
+        }
+        return this;
+    }
+
+    public String toString(){
+        return super.toString() + " - maxTotalConnections: " + maxTotalConnections +
+                ", maxConnectionsPerRoute: " + maxConnectionsPerRoute + ", connectionTimeout: " + connectionTimeout +
+                ", userAgent: " + userAgent;
+    }
+
+
+    /**
+     * Builds the HTTP parameters from the bean properties of this client.
+     */
+    @Override protected HttpParams createHttpParams(){
+        HttpParams params = new BasicHttpParams();
+        ConnManagerParams.setMaxTotalConnections(params, maxTotalConnections);
+        HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);
+        HttpProtocolParams.setUserAgent(params, userAgent);
+        HttpProtocolParams.setContentCharset(params, HTTP.DEFAULT_CONTENT_CHARSET);
+        HttpProtocolParams.setUseExpectContinue(params, true);
+        HttpConnectionParams.setConnectionTimeout(params, connectionTimeout);
+        HttpConnectionParams.setTcpNoDelay(params, false);
+        HttpConnectionParams.setStaleCheckingEnabled(params, false);
+        HttpConnectionParams.setSocketBufferSize(params, 8192);
+        ConnManagerParams.setMaxConnectionsPerRoute(params, new ConnPerRouteBean(maxConnectionsPerRoute));
+        ConnManagerParams.setTimeout(params, CONNECTION_MANAGER_TIMEOUT);
+        return params;
+    }
+
+    /**
+     * Builds a thread-safe connection manager with the http (port 80) and https (port 443) schemes registered.
+     */
+    @Override protected ClientConnectionManager createClientConnectionManager(){
+        SchemeRegistry schemeRegistry = new SchemeRegistry();
+        schemeRegistry.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
+        schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
+        return new ThreadSafeClientConnManager(getParams(), schemeRegistry);
+    }
+
+    public int getConnectionTimeout(){
+        return connectionTimeout;
+    }
+
+    public void setConnectionTimeout(int connectionTimeout){
+        this.connectionTimeout = connectionTimeout;
+    }
+
+    public String getUserAgent(){
+        return userAgent;
+    }
+
+    public void setUserAgent(String userAgent){
+        this.userAgent = userAgent;
+    }
+
+    public int getMaxTotalConnections(){
+        return maxTotalConnections;
+    }
+
+    public void setMaxTotalConnections(int maxTotalConnections){
+        this.maxTotalConnections = maxTotalConnections;
+    }
+
+    public int getMaxConnectionsPerRoute(){
+        return maxConnectionsPerRoute;
+    }
+
+    public void setMaxConnectionsPerRoute(int maxConnectionsPerRoute){
+        this.maxConnectionsPerRoute = maxConnectionsPerRoute;
+    }
+
+    /**
+     * @return whether {@link #init()} installs the response buffering interceptor
+     */
+    public boolean isUseBuffer(){
+        return useBuffer;
+    }
+
+    /**
+     * @param useBuffer whether {@link #init()} shall install the response buffering interceptor; has no effect
+     *                  once init() has run
+     */
+    public void setUseBuffer(boolean useBuffer){
+        this.useBuffer = useBuffer;
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.fetcher.http;
+
+/**
+ * Implemented by fetchers whose outgoing HTTP request headers can be changed.
+ */
+public interface HttpHeaderSupport {
+
+ /**
+ * Adds a header to the outgoing request.
+ * <p/>
+ * Notice: it works only in preFetch event of FetchFilter
+ * NOTE(review): "preFetch" presumably refers to FetchFilter.requestReady, i.e. after the request has been
+ * prepared and before it is executed - confirm
+ *
+ * @param key the header name
+ * @param value the header value
+ */
+ void addHttpHeader(String key, String value);
+
+ /**
+ * Sets a header on the outgoing request.
+ * <p/>
+ * Notice: it works only in preFetch event of FetchFilter
+ * NOTE(review): "preFetch" presumably refers to FetchFilter.requestReady, i.e. after the request has been
+ * prepared and before it is executed - confirm
+ *
+ * @param key the header name
+ * @param value the header value
+ */
+ void setHttpHeader(String key, String value);
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.filter.LinkFilter;
+import org.apache.droids.crawler.extractor.Extractor;
+import org.apache.droids.crawler.extractor.ExtractFilter;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.parser.Parser;
+
+import java.util.Set;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+public class DepthFilter<T extends Link> implements LinkFilter<T>, ExtractFilter<T>{
+ protected static Log log = LogFactory.getLog(DepthFilter.class);
+ public static final int DEFAULT_DEPTH = 0;
+ protected int maxDepth;
+ protected AtomicInteger removeCounter = new AtomicInteger();
+
+ public DepthFilter(){
+ this.maxDepth = DEFAULT_DEPTH;
+ }
+
+ public DepthFilter(int maxDepth){
+ this.maxDepth = maxDepth;
+ }
+
+ public String toString(){
+ return super.toString() + " - maxDepth: " + maxDepth;
+ }
+
+
+ /**
+ * add "depth" data to any polled link without "depth"
+ */
+ public T polled(T link){
+ if (!link.containsKey("depth")){
+ link.put("depth", 0);
+ if (log.isTraceEnabled()) log.trace("polled() - added depth - link: " + link);
+ }
+ return link;
+ }
+
+
+ public Set<T> extracted(T base, Extractor<T, ? extends Parser> extractor, Set<T> links){
+ if (links == null) return null;
+ if (log.isTraceEnabled())
+ log.trace("maxDepth: " + maxDepth + ", base: " + base + ", links.size(): " + links.size());
+ Set<T> removeList = new HashSet<T>();
+ for (T link : links){
+ if (!link.containsKey("depth")){
+ int depth = base.containsKey("depth") ? (Integer) base.get("depth") + 1 : 1;
+ link.put("depth", depth);
+ //if (log.isTraceEnabled()) log.trace("marked - depth: " + depth + ", link: " + link);
+ }
+
+ if ((Integer) link.get("depth") > maxDepth){
+ if (log.isTraceEnabled()) log.trace("rejecting - maxDepth: " + maxDepth + ", link: " + link);
+ removeList.add(link);
+ }
+ }
+
+ if (removeList.size() > 0){
+ removeCounter.addAndGet(removeList.size());
+ int linkSize = links.size();
+ links.removeAll(removeList);
+ if (log.isDebugEnabled())
+ log.debug("rejected " + removeList.size() + " link(s) - links.size(): " + linkSize + " -> " + links.size());
+ }
+
+ return links;
+ }
+
+ public Set<T> extractedAll(T base, List<Extractor<T, ? extends Parser>> extractor, Set<T> links){
+ if (log.isDebugEnabled()){
+ for (T link : links){
+ if (!link.containsKey("depth")){
+ log.warn("extractedAll() - extracted link without 'weight' attribute - link: " + link);
+ }
+ }
+ }
+ return links;
+ }
+
+ public void completed(T link, Set<T> links){ }
+
+ public void failed(T link, Object object){ }
+
+ public int getMaxDepth(){
+ return maxDepth;
+ }
+
+ public void setMaxDepth(int maxDepth){
+ this.maxDepth = maxDepth;
+ }
+
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.filter.Filter;
+
+/**
+ * Filter hooks around the fetch step of a {@link Link}.
+ * An implementation is expected to cast the fetcher to its concrete type and access it for processing,
+ * e.g. read the fetcher's HTTP request or response data, and set any data on the Link.
+ */
+public interface FetchFilter<T extends Link> extends Filter<T>{
+
+ /**
+ * Called after a request is prepared and before the content is fetched, and also before any delay is applied.
+ *
+ * @param link the link about to be fetched
+ * @param fetcher the fetcher holding the prepared request
+ */
+ void requestReady(T link, Fetcher<T> fetcher);
+
+ /**
+ * Filters the Link and the result processed by a fetcher. An implementation may, for example, wrap the Fetcher
+ * in one that provides a different getContent() so that the parser processes the fetched content differently.
+ *
+ * @param link the link that was fetched
+ * @param fetcher the fetcher that processed the link
+ * @return the fetcher to hand on: either the given one or a wrapper around it
+ */
+ Fetcher<T> fetched(T link, Fetcher<T> fetcher);
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import java.io.Serializable;
+
+import org.apache.droids.crawler.Link;
+
+/**
+ * Root interface of all filters. It declares no methods of its own; because it extends
+ * {@link Serializable}, every filter implementation is serializable.
+ *
+ * @param <T> the link type the filter operates on
+ */
+public interface Filter<T extends Link> extends Serializable{
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import java.util.List;
+
+/**
+ * Contract for components that manage an ordered list of filters.
+ *
+ * @param <T> the filter type being managed
+ */
+public interface FilterSupport<T> {
+
+ /**
+ * Adds a filter.
+ *
+ * @param filter the filter to add
+ * @return NOTE(review): presumably true when the filter was added, as with Collection.add - confirm against implementations
+ */
+ boolean addFilter(T filter);
+
+ /**
+ * Removes a filter.
+ *
+ * @param filter the filter to remove
+ * @return NOTE(review): presumably true when the filter was present and removed - confirm against implementations
+ */
+ boolean removeFilter(T filter);
+
+ /**
+ * Sets the filters. Filters are applied in list order.
+ *
+ * @param filters the filters, in the order they are to be applied
+ */
+ void setFilters(List<T> filters);
+
+ /**
+ * @return the filters held by this component
+ */
+ List<T> getFilters();
+
+ /**
+ * @return NOTE(review): presumably true when at least one filter is registered - confirm against implementations
+ */
+ boolean hasFilter();
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.extractor.Extractor;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.parser.Parser;
+
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Filter, a.k.a. CrawlerFilter, may be used in different ways as follows:
+ * - as a classifier to append custom information to a Link that will be passed through the whole process chain, and
+ * be replicated in a cluster environment
+ * - to stop the chain of processing. If any filter returns null, the Crawler/Droid/TaskMaster shall stop further
+ * processing of the Link
+ * - to alter any data in the processing, e.g. in fetched(), the fetcher stores the fetched HttpEntity. A filter could
+ * alter that HttpEntity.
+ */
+public interface LinkFilter<T extends Link> extends Filter<T>{
+ //TODO consider to change to Enum
+ // NOTE(review): these look like identifiers for the processing stages; their usage is not visible here.
+ public static final int POLLED = 1, FETCHED = 2, PARSED = 3, EXTRACTED = 4, COMPLETED = 5, FAILED = -1;
+
+ /**
+ * Filters the Link taken from the master Queue. The crawler/droid/taskmaster shall honor the filters and pass the
+ * returned link on to the next filter.
+ *
+ * @param link the link that was polled
+ * @return the link to continue with
+ */
+ public T polled(T link);
+
+ /*
+ * Disabled declarations, kept for reference only. fetched() and parsed() callbacks are declared by
+ * FetchFilter and ParseFilter.
+ *
+ public Fetcher<T> fetched(T link, Fetcher<T> fetcher);
+ public Parser<T> parsed(T link, Parser<T> parser);
+ public Set<T> extracted(T link, Extractor<T> extractor, Set<T> links);
+ public Set<T> extractedAll(T link, Set<T> links);
+ */
+
+ /**
+ * Completed processing of a link. Any thread-local resource could be released.
+ *
+ * @param link the link whose processing completed
+ * @param links NOTE(review): presumably the links extracted from {@code link} - confirm against callers
+ */
+ public void completed(T link, final Set<T> links);
+
+ /**
+ * Called when processing of a link failed.
+ *
+ * @param link the link whose processing failed
+ * @param object NOTE(review): unspecified here; presumably the failure cause or context - confirm against callers
+ */
+ public void failed(T link, Object object);
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,16 @@
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.parser.Parser;
+
+/**
+ * Filter hooks around the parse step of a {@link Link}.
+ *
+ * @param <T> the link type
+ * @param <D> the second type argument of {@link Parser}; NOTE(review): presumably the parsed document type - confirm
+ */
+public interface ParseFilter<T extends Link, D> extends Filter<T>{
+
+ /**
+ * This provides an interception point right before the parse() operation. The Parser and any underlying parsers or
+ * supporting utilities shall all be ready. An implementation should cast the parser to its concrete type and access
+ * its internal variables for operation.
+ *
+ * @param link the link about to be parsed
+ * @param parser the parser that is ready to run
+ */
+ void parserReady(T link, Parser<T, D> parser);
+
+ /**
+ * NOTE(review): presumably invoked after the parse() operation - confirm against callers.
+ *
+ * @param link the link being processed
+ * @param parser the parser used for the link
+ * @return the parser to hand on
+ */
+ Parser<T, D> parsed(T link, Parser<T, D> parser);
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.filter;
+
+import org.apache.droids.crawler.Link;
+import org.apache.droids.crawler.parser.Parser;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.util.Weighted;
+import org.apache.droids.crawler.extractor.ExtractFilter;
+import org.apache.droids.crawler.extractor.Extractor;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.Set;
+import java.util.List;
+
+/**
+ * Filter that records processing progress on the link itself by calling
+ * {@code setState} in its callbacks: POLLED, FETCHED, PARSED, EXTRACTED and COMPLETED.
+ * Every callback with a return value hands back the argument it received.
+ */
+public class StateFilter<T extends Link> implements LinkFilter<T>, FetchFilter<T>, ParseFilter<T, Object>, ExtractFilter<T>, Weighted{
+  protected static Log log = LogFactory.getLog(StateFilter.class);
+
+  /**
+   * @return the fixed weight of this filter, 1000
+   */
+  public int getWeight(){
+    return 1000;
+  }
+
+  /**
+   * Marks the link as POLLED and reports it at debug level.
+   *
+   * @param link the polled link
+   * @return the same link
+   */
+  public T polled(T link){
+    link.setState(Link.State.POLLED);
+    if (log.isDebugEnabled()){
+      log.debug("polled() - set link to POLLED - link: " + link);
+    }
+    return link;
+  }
+
+  /**
+   * No-op: the link state is left untouched before the fetch.
+   */
+  public void requestReady(T link, Fetcher<T> fetcher){
+  }
+
+  /**
+   * Marks the link as FETCHED.
+   *
+   * @param link the fetched link
+   * @param fetcher the fetcher that processed the link
+   * @return the same fetcher
+   */
+  public Fetcher<T> fetched(T link, Fetcher<T> fetcher){
+    link.setState(Link.State.FETCHED);
+    return fetcher;
+  }
+
+  /**
+   * No-op: the link state is left untouched before the parse.
+   */
+  public void parserReady(T link, Parser<T, Object> parser){
+  }
+
+  /**
+   * Marks the link as PARSED and reports it at debug level.
+   *
+   * @param link the parsed link
+   * @param parser the parser used for the link
+   * @return the same parser
+   */
+  public Parser<T, Object> parsed(T link, Parser<T, Object> parser){
+    link.setState(Link.State.PARSED);
+    if (log.isDebugEnabled()){
+      log.debug("parsed() - set link to PARSED - link: " + link);
+    }
+    return parser;
+  }
+
+  /**
+   * Passes the links through without changing any state.
+   *
+   * @return the same {@code links} instance
+   */
+  public Set<T> extracted(T base, Extractor<T, ? extends Parser> tExtractor, Set<T> links){
+    return links;
+  }
+
+  /**
+   * Marks the base link as EXTRACTED.
+   *
+   * @param base the link the extraction was based on
+   * @param extractors not read by this method
+   * @param links the extracted links
+   * @return the same {@code links} instance
+   */
+  public Set<T> extractedAll(T base, List<Extractor<T, ? extends Parser>> extractors, Set<T> links){
+    base.setState(Link.State.EXTRACTED);
+    return links;
+  }
+
+  /**
+   * Marks the link as COMPLETED.
+   *
+   * @param link the link whose processing completed
+   * @param links not read by this method
+   */
+  public void completed(T link, Set<T> links){
+    link.setState(Link.State.COMPLETED);
+  }
+
+  /**
+   * No-op: no state is recorded on the link in this callback.
+   */
+  public void failed(T link, Object object){
+  }
+
+}