Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:46 UTC

[30/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
new file mode 100644
index 0000000..d6f33a3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
@@ -0,0 +1,91 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8" />
+<title>Wicket extend</title>
+</head>
+
+<body>
+	<wicket:extend>
+		<h2>
+			<wicket:message key="page.header.seedList">Seed list</wicket:message>
+		</h2>
+
+		<div class="row">
+			<div class="col-lg-8">
+				<form class="form-horizontal" wicket:id="seedList">
+					<fieldset>
+						<!-- Text input-->
+						<div class="form-group">
+							<label class="col-md-4 control-label" for="textinput">Seed list name</label>
+							<div class="col-md-4">
+								<input wicket:id="name" name="textinput" class="form-control input-md" type="text">
+							</div>
+						</div>
+						<div class="form-group">
+							<div class="col-md-offset-4 col-md-4">
+								<button type="submit" class="btn btn-primary">Save</button>
+							</div>
+						</div>
+					</fieldset>
+				</form>
+				<h3>Seed urls</h3>
+				<table class="table table-hover table-striped tablesorter">
+					<thead>
+						<tr>
+							<th class="header col-md-3">Url</th>
+							<th></th>
+						</tr>
+					</thead>
+
+					<tbody wicket:id="seedUrlsTable">
+						<tr wicket:id="seedUrls">
+							<td>
+								<span wicket:id="url">http://google.com</span>
+							</td>
+							<td>
+								<button wicket:id="delete" class="btn btn-sm btn-danger" type="button">
+									<span class="fa fa-trash-o"></span>
+								</button>
+							</td>
+						</tr>
+					</tbody>
+				</table>
+				<form class="form-horizontal" wicket:id="urlForm">
+					<fieldset>
+						<div class="form-group">
+							<div class="col-md-4">
+								<input wicket:id="url" name="textinput" class="form-control input-md" type="text">
+							</div>
+							<div>
+								<button wicket:id="addUrl" class="btn btn-primary">Add url</button>
+							</div>
+						</div>
+					</fieldset>
+				</form>
+			</div>
+			<div class="col-lg-4">
+				<div class="panel panel-primary">
+					<div class="panel-heading">
+						<h3 class="panel-title">Help</h3>
+					</div>
+					<div class="panel-body">
+						<p>Some help about seed management</p>
+					</div>
+				</div>
+			</div>
+		</div>
+		<!--row-->
+	</wicket:extend>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
new file mode 100644
index 0000000..fba07ab
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.seed;
+
+import java.util.Iterator;
+
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.model.SeedUrl;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.SeedListService;
+import org.apache.wicket.ajax.AjaxRequestTarget;
+import org.apache.wicket.ajax.markup.html.AjaxLink;
+import org.apache.wicket.ajax.markup.html.form.AjaxSubmitLink;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.form.Form;
+import org.apache.wicket.markup.html.form.TextField;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.CompoundPropertyModel;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.model.LoadableDetachableModel;
+import org.apache.wicket.model.Model;
+import org.apache.wicket.request.mapper.parameter.PageParameters;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+import com.google.common.collect.Lists;
+
+/**
+ * This page is for seed URL management.
+ * 
+ * @author feodor
+ * 
+ */
+public class SeedPage extends AbstractBasePage<SeedList> {
+
+  @SpringBean
+  private SeedListService seedListService;
+
+  private Form<SeedUrl> urlForm;
+
+  private WebMarkupContainer seedUrlsTable;
+
+  public SeedPage() {
+    SeedList list = new SeedList();
+    list.setSeedUrls(Lists.<SeedUrl> newArrayList());
+    initPage(Model.of(list));
+  }
+
+  public SeedPage(final PageParameters parameters) {
+    initPage(new LoadableDetachableModel<SeedList>() {
+
+      @Override
+      protected SeedList load() {
+        Long seedListId = parameters.get("id").toLongObject();
+        return seedListService.getSeedList(seedListId);
+      }
+    });
+  }
+
+  public void initPage(IModel<SeedList> model) {
+    setModel(new CompoundPropertyModel<SeedList>(model));
+
+    addBaseForm();
+    addSeedUrlsList();
+    addUrlForm();
+  }
+
+  private void addBaseForm() {
+    Form<SeedList> form = new Form<SeedList>("seedList", getModel()) {
+      @Override
+      protected void onSubmit() {
+        seedListService.save(getModelObject());
+        setResponsePage(SeedListsPage.class);
+      }
+    };
+    form.add(new TextField<String>("name"));
+    add(form);
+  }
+
+  private void addSeedUrlsList() {
+    seedUrlsTable = new WebMarkupContainer("seedUrlsTable");
+    seedUrlsTable.setOutputMarkupId(true);
+
+    RefreshingView<SeedUrl> seedUrls = new RefreshingView<SeedUrl>("seedUrls") {
+
+      @Override
+      protected Iterator<IModel<SeedUrl>> getItemModels() {
+        return new CpmIteratorAdapter<SeedUrl>(getModelObject().getSeedUrls());
+      }
+
+      @Override
+      protected void populateItem(Item<SeedUrl> item) {
+        item.add(new Label("url"));
+        item.add(new AjaxLink<SeedUrl>("delete", item.getModel()) {
+
+          @Override
+          public void onClick(AjaxRequestTarget target) {
+            deleteSeedUrl(getModelObject());
+            target.add(seedUrlsTable);
+          }
+        });
+      }
+    };
+    seedUrlsTable.add(seedUrls);
+    add(seedUrlsTable);
+  }
+
+  private void addUrlForm() {
+    urlForm = new Form<SeedUrl>("urlForm", CompoundPropertyModel.of(Model
+        .of(new SeedUrl())));
+    urlForm.setOutputMarkupId(true);
+    urlForm.add(new TextField<String>("url"));
+    urlForm.add(new AjaxSubmitLink("addUrl", urlForm) {
+      @Override
+      protected void onSubmit(AjaxRequestTarget target, Form<?> form) {
+        addSeedUrl();
+        urlForm.setModelObject(new SeedUrl());
+        target.add(urlForm);
+        target.add(seedUrlsTable);
+      }
+    });
+    add(urlForm);
+  }
+
+  private void addSeedUrl() {
+    SeedUrl url = urlForm.getModelObject();
+    SeedList seedList = getModelObject();
+    url.setSeedList(seedList);
+    seedList.getSeedUrls().add(url);
+  }
+
+  private void deleteSeedUrl(SeedUrl url) {
+    SeedList seedList = getModelObject();
+    seedList.getSeedUrls().remove(url);
+  }
+
+}
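
A note on the binding pattern used above: the page relies on Wicket's
CompoundPropertyModel, where a component id doubles as a property expression
against the model object. That is why the wicket:id attributes "name" and
"url" in SeedPage.html match property names of SeedList and SeedUrl. A
minimal sketch of the idiom (illustrative only, not part of the commit):

    import org.apache.nutch.webui.model.SeedList;
    import org.apache.wicket.MarkupContainer;
    import org.apache.wicket.markup.html.form.Form;
    import org.apache.wicket.markup.html.form.TextField;
    import org.apache.wicket.model.CompoundPropertyModel;

    class SeedFormSketch {
      /** Attach a form whose component ids double as SeedList property names. */
      static void addSeedForm(MarkupContainer container) {
        Form<SeedList> form = new Form<SeedList>("seedList",
            new CompoundPropertyModel<SeedList>(new SeedList()));
        // "name" resolves to SeedList#getName()/#setName() and binds to
        // <input wicket:id="name"> in the markup above.
        form.add(new TextField<String>("name"));
        container.add(form);
      }
    }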

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
new file mode 100644
index 0000000..8810371
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
@@ -0,0 +1,53 @@
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+	license agreements. See the NOTICE file distributed with this work for additional 
+	information regarding copyright ownership. The ASF licenses this file to 
+	You under the Apache License, Version 2.0 (the "License"); you may not use 
+	this file except in compliance with the License. You may obtain a copy of 
+	the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+	by applicable law or agreed to in writing, software distributed under the 
+	License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+	OF ANY KIND, either express or implied. See the License for the specific 
+	language governing permissions and limitations under the License. -->
+<!DOCTYPE html>
+<html xmlns:wicket="http://wicket.apache.org">
+<head>
+<meta charset="utf-8" />
+<title>Wicket extend</title>
+</head>
+
+<body>
+	<wicket:extend>
+		<h2>
+			<wicket:message key="settings">Settings</wicket:message>
+		</h2>
+		<div class="row">
+			<div class="col-lg-12">
+				<table class="table table-hover tablesorter table-bordered">
+					<thead>
+						<tr>
+							<th class="header col-lg-3">
+								<wicket:message key="settings.header.name">Name</wicket:message>
+							</th>
+							<th class="header col-lg-9">
+								<wicket:message key="settings.header.value">Value</wicket:message>
+							</th>
+						</tr>
+					</thead>
+					<tbody wicket:id="settingsTable">
+						<tr wicket:id="settings">
+							<td>
+								<span wicket:id="name">Name</span>
+							</td>
+							<td>
+								<input class="col-lg-12" wicket:id="value" placeholder="http://localhost:8080">
+							</td>
+						</tr>
+					</tbody>
+				</table>
+			</div>
+		</div>
+	</wicket:extend>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
new file mode 100644
index 0000000..29e46f7
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.settings;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.nutch.webui.model.NutchConfig;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.NutchService;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.form.TextField;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+public class SettingsPage extends AbstractBasePage<Void> {
+  @SpringBean
+  private NutchService nutchService;
+
+  private WebMarkupContainer settingsTable;
+
+  public SettingsPage() {
+    settingsTable = new WebMarkupContainer("settingsTable");
+    settingsTable.setOutputMarkupId(true);
+    RefreshingView<NutchConfig> nutchConfig = new RefreshingView<NutchConfig>(
+        "settings") {
+
+      @Override
+      protected Iterator<IModel<NutchConfig>> getItemModels() {
+        return new CpmIteratorAdapter<NutchConfig>(
+            convertNutchConfig(nutchService.getNutchConfig(getCurrentInstance()
+                .getId())));
+      }
+
+      @Override
+      protected void populateItem(Item<NutchConfig> item) {
+        item.add(new Label("name"));
+        item.add(new TextField<String>("value"));
+      }
+    };
+    settingsTable.add(nutchConfig);
+    add(settingsTable);
+  }
+
+  private List<NutchConfig> convertNutchConfig(Map<String, String> map) {
+    List<NutchConfig> listNutchConfigs = new LinkedList<NutchConfig>();
+    for (Map.Entry<String, String> entry : map.entrySet()) {
+      NutchConfig conf = new NutchConfig();
+      conf.setName(entry.getKey());
+      conf.setValue(entry.getValue());
+      listNutchConfigs.add(conf);
+    }
+    return listNutchConfigs;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
new file mode 100644
index 0000000..c742b48
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.model.NutchInstance;
+
+public interface CrawlService {
+
+  public void saveCrawl(Crawl crawl);
+
+  public List<Crawl> getCrawls();
+
+  public void startCrawl(Long crawlId, NutchInstance instance);
+
+  public void deleteCrawl(Long crawlId);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
new file mode 100644
index 0000000..23f27e8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.NutchInstance;
+
+public interface NutchInstanceService {
+
+  public List<NutchInstance> getInstances();
+
+  public void saveInstance(NutchInstance instance);
+
+  public void removeInstance(Long id);
+
+  public NutchInstance getInstance(Long id);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
new file mode 100644
index 0000000..643236a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.Map;
+
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.NutchStatus;
+
+public interface NutchService {
+  public ConnectionStatus getConnectionStatus(Long instanceId);
+
+  public Map<String, String> getNutchConfig(Long instanceId);
+
+  public NutchStatus getNutchStatus(Long instanceId);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
new file mode 100644
index 0000000..dda8c71
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.SeedList;
+
+public interface SeedListService {
+
+  public void save(SeedList seedList);
+
+  public void delete(Long seedListId);
+
+  public List<SeedList> findAll();
+
+  public SeedList getSeedList(Long seedListId);
+
+}
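
For orientation, a short usage sketch of the service above. It assumes the
SeedList accessors implied by SeedPage.java (setName()/getName(),
setSeedUrls()) and a container-injected SeedListService; it is not part of
the commit:

    import java.util.ArrayList;

    import org.apache.nutch.webui.model.SeedList;
    import org.apache.nutch.webui.model.SeedUrl;
    import org.apache.nutch.webui.service.SeedListService;

    public class SeedListExample {
      /** Create and persist a seed list, then print all stored lists. */
      public static void roundTrip(SeedListService service) {
        SeedList list = new SeedList();
        list.setName("example");                    // assumed mutator
        list.setSeedUrls(new ArrayList<SeedUrl>()); // as done in SeedPage
        service.save(list);
        for (SeedList s : service.findAll()) {
          System.out.println(s.getName());          // assumed accessor
        }
      }
    }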

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
new file mode 100644
index 0000000..7bb133b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClient;
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.client.impl.CrawlingCycle;
+import org.apache.nutch.webui.client.impl.RemoteCommandsBatchFactory;
+import org.apache.nutch.webui.client.impl.CrawlingCycleListener;
+import org.apache.nutch.webui.client.impl.RemoteCommand;
+import org.apache.nutch.webui.client.impl.RemoteCommandExecutor;
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.client.model.Crawl.CrawlStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.CrawlService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.scheduling.annotation.Async;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+@Service
+public class CrawlServiceImpl implements CrawlService, CrawlingCycleListener {
+  private Logger log = LoggerFactory.getLogger(CrawlServiceImpl.class);
+
+  @Resource
+  private Dao<Crawl, Long> crawlDao;
+
+  @Resource
+  private NutchClientFactory nutchClientFactory;
+
+  @Resource
+  private RemoteCommandsBatchFactory commandFactory;
+
+  @Override
+  @Async
+  public void startCrawl(Long crawlId, NutchInstance instance) {
+    Crawl crawl = null;
+    try {
+      crawl = crawlDao.queryForId(crawlId);
+      if (crawl.getCrawlId() == null) {
+        crawl.setCrawlId("crawl-" + crawlId.toString());
+      }
+      NutchClient client = nutchClientFactory.getClient(instance);
+      String seedDirectory = client.createSeed(crawl.getSeedList());
+      crawl.setSeedDirectory(seedDirectory);
+
+      List<RemoteCommand> commands = commandFactory.createCommands(crawl);
+      RemoteCommandExecutor executor = new RemoteCommandExecutor(client);
+
+      CrawlingCycle cycle = new CrawlingCycle(this, executor, crawl, commands);
+      cycle.executeCrawlCycle();
+
+    } catch (Exception e) {
+      if (crawl != null) { // the DAO lookup itself may have failed
+        crawl.setStatus(CrawlStatus.ERROR);
+        saveCrawl(crawl);
+      }
+      log.error("exception occurred", e);
+    }
+  }
+
+  @Override
+  public List<Crawl> getCrawls() {
+    try {
+      return crawlDao.queryForAll();
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void saveCrawl(Crawl crawl) {
+    try {
+      crawlDao.createOrUpdate(crawl);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void deleteCrawl(Long crawlId) {
+    try {
+      crawlDao.deleteById(crawlId);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void crawlingStarted(Crawl crawl) {
+    crawl.setStatus(CrawlStatus.CRAWLING);
+    crawl.setProgress(0);
+    saveCrawl(crawl);
+  }
+
+  @Override
+  public void onCrawlError(Crawl crawl, String msg) {
+    crawl.setStatus(CrawlStatus.ERROR);
+    saveCrawl(crawl);
+  }
+
+  @Override
+  public void commandExecuted(Crawl crawl, RemoteCommand command, int progress) {
+    crawl.setProgress(progress);
+    saveCrawl(crawl);
+  }
+
+  @Override
+  public void crawlingFinished(Crawl crawl) {
+    crawl.setStatus(CrawlStatus.FINISHED);
+    saveCrawl(crawl);
+  }
+}
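
CrawlServiceImpl doubles as the CrawlingCycleListener that CrawlingCycle
reports back to: a cycle signals crawlingStarted(), then one commandExecuted()
per remote command, and finally crawlingFinished() or onCrawlError(). A
standalone listener sketch, assuming the interface consists of exactly the
four callbacks overridden above:

    import org.apache.nutch.webui.client.impl.CrawlingCycleListener;
    import org.apache.nutch.webui.client.impl.RemoteCommand;
    import org.apache.nutch.webui.client.model.Crawl;

    public class LoggingCycleListener implements CrawlingCycleListener {
      @Override
      public void crawlingStarted(Crawl crawl) {
        System.out.println("started: " + crawl.getCrawlId());
      }

      @Override
      public void commandExecuted(Crawl crawl, RemoteCommand command, int progress) {
        System.out.println("progress: " + progress + "%");
      }

      @Override
      public void crawlingFinished(Crawl crawl) {
        System.out.println("finished: " + crawl.getCrawlId());
      }

      @Override
      public void onCrawlError(Crawl crawl, String msg) {
        System.err.println("failed: " + msg);
      }
    }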

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
new file mode 100644
index 0000000..e100054
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+@Service
+public class NutchInstanceServiceImpl implements NutchInstanceService {
+
+  @Resource
+  private NutchClientFactory nutchClientFactory;
+
+  @Resource
+  private Dao<NutchInstance, Long> instancesDao;
+
+  @Override
+  public List<NutchInstance> getInstances() {
+    try {
+      return instancesDao.queryForAll();
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+
+  @Override
+  public NutchInstance getInstance(Long id) {
+    try {
+      return instancesDao.queryForId(id);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void saveInstance(NutchInstance instance) {
+    try {
+      instancesDao.createOrUpdate(instance);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void removeInstance(Long id) {
+    try {
+      instancesDao.deleteById(id);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
new file mode 100644
index 0000000..db989cf
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.net.ConnectException;
+import java.util.Collections;
+import java.util.Map;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.NutchStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.apache.nutch.webui.service.NutchService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Service;
+
+import com.sun.jersey.api.client.ClientHandlerException;
+
+@Service
+public class NutchServiceImpl implements NutchService {
+  private static final Logger logger = LoggerFactory
+      .getLogger(NutchServiceImpl.class);
+
+  @Resource
+  private NutchClientFactory nutchClientFactory;
+
+  @Resource
+  private NutchInstanceService instanceService;
+
+  @Override
+  public ConnectionStatus getConnectionStatus(Long instanceId) {
+    NutchInstance instance = instanceService.getInstance(instanceId);
+    try {
+      NutchStatus nutchStatus = nutchClientFactory.getClient(instance)
+          .getNutchStatus();
+      if (nutchStatus.getStartDate() != null) {
+        return ConnectionStatus.CONNECTED;
+      }
+    } catch (Exception e) {
+      if (e.getCause() instanceof ConnectException) {
+        return ConnectionStatus.DISCONNECTED;
+      }
+
+      logger.error("Cannot connect to nutch server!", e);
+    }
+    return null;
+  }
+
+  @Override
+  public Map<String, String> getNutchConfig(Long instanceId) {
+    NutchInstance instance = instanceService.getInstance(instanceId);
+    try {
+      return nutchClientFactory.getClient(instance).getNutchConfig("default");
+    } catch (ClientHandlerException exception) {
+      return Collections.emptyMap();
+    }
+  }
+
+  @Override
+  public NutchStatus getNutchStatus(Long instanceId) {
+    NutchInstance instance = instanceService.getInstance(instanceId);
+    return nutchClientFactory.getClient(instance).getNutchStatus();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
new file mode 100644
index 0000000..fced2d3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.model.SeedUrl;
+import org.apache.nutch.webui.service.SeedListService;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+@Service
+public class SeedListServiceImpl implements SeedListService {
+
+  @Resource
+  private Dao<SeedList, Long> seedListDao;
+
+  @Resource
+  private Dao<SeedUrl, Long> seedUrlDao;
+
+  @Override
+  public void save(SeedList seedList) {
+    try {
+      seedListDao.createOrUpdate(seedList);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public void delete(Long seedListId) {
+    try {
+      seedListDao.deleteById(seedListId);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+
+  @Override
+  public List<SeedList> findAll() {
+    try {
+      return seedListDao.queryForAll();
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public SeedList getSeedList(Long seedListId) {
+    try {
+      return seedListDao.queryForId(seedListId);
+    } catch (SQLException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/overview.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/overview.html b/nutch-core/src/main/java/overview.html
new file mode 100644
index 0000000..1132141
--- /dev/null
+++ b/nutch-core/src/main/java/overview.html
@@ -0,0 +1,9 @@
+<html>
+<head>
+   <title>Apache Nutch</title>
+</head>
+<body>
+<p>Apache Nutch is a highly extensible and scalable open source web crawler software project.</p>
+<p>Nutch is a project of the Apache Software Foundation and is part of the larger Apache community of developers and users.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
new file mode 100644
index 0000000..bb938a6
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
@@ -0,0 +1,265 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.crawl.CrawlDbUpdateUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Emulate a continuous crawl for one URL.
+ * 
+ */
+public class ContinuousCrawlTestUtil extends TestCase {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ContinuousCrawlTestUtil.class);
+
+  protected static Text dummyURL = new Text("http://nutch.apache.org/");
+
+  protected static Configuration defaultConfig = CrawlDBTestUtil
+      .createConfiguration();
+
+  /** default: launch crawler every day */
+  protected long interval = FetchSchedule.SECONDS_PER_DAY * 1000;
+
+  /** default: run for two years */
+  protected long duration = 2 * 365L * FetchSchedule.SECONDS_PER_DAY * 1000L;
+
+  protected Configuration configuration;
+  private FetchSchedule schedule;
+
+  /** status a fetched datum should get */
+  protected byte fetchStatus = CrawlDatum.STATUS_FETCH_SUCCESS;
+  /** expected status of the resulting Db datum */
+  protected byte expectedDbStatus = CrawlDatum.STATUS_DB_FETCHED;
+
+  /** for signature calculation */
+  protected Signature signatureImpl;
+  protected Content content = new Content();
+
+  {
+    byte[] data = { 'n', 'u', 't', 'c', 'h' };
+    content.setContent(data);
+  }
+
+  protected ContinuousCrawlTestUtil(Configuration conf) {
+    configuration = conf;
+    schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf));
+    signatureImpl = SignatureFactory.getSignature(conf);
+  }
+
+  protected ContinuousCrawlTestUtil(Configuration conf, byte fetchStatus,
+      byte expectedDbStatus) {
+    this(conf);
+    this.fetchStatus = fetchStatus;
+    this.expectedDbStatus = expectedDbStatus;
+  }
+
+  protected ContinuousCrawlTestUtil() {
+    this(defaultConfig);
+  }
+
+  protected ContinuousCrawlTestUtil(byte fetchStatus, byte expectedDbStatus) {
+    this(defaultConfig, fetchStatus, expectedDbStatus);
+  }
+
+  /** set the interval at which the crawl is relaunched (default: every day) */
+  protected void setInterval(int seconds) {
+    interval = seconds * 1000L;
+  }
+
+  /** set the duration of the continuous crawl (default: 2 years) */
+  protected void setDuration(int seconds) {
+    duration = seconds * 1000L;
+  }
+
+  /**
+   * default fetch action: set status and time
+   * 
+   * @param datum
+   *          CrawlDatum to fetch
+   * @param currentTime
+   *          current time used to set the fetch time via
+   *          {@link CrawlDatum#setFetchTime(long)}
+   * @return the modified CrawlDatum
+   */
+  protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+    datum.setStatus(fetchStatus);
+    datum.setFetchTime(currentTime);
+    return datum;
+  }
+
+  /**
+   * get signature for content and configured signature implementation
+   */
+  protected byte[] getSignature() {
+    return signatureImpl.calculate(content, null);
+  }
+
+  /**
+   * change content to force a changed signature
+   */
+  protected void changeContent() {
+    byte[] data = Arrays.copyOf(content.getContent(),
+        content.getContent().length + 1);
+    data[content.getContent().length] = '2'; // append one byte
+    content.setContent(data);
+    LOG.info("document content changed");
+  }
+
+  /**
+   * default parse action: add signature if successfully fetched
+   * 
+   * @param fetchDatum
+   *          fetch datum
+   * @return list of all datums resulting from parse (status: signature, linked,
+   *         parse_metadata)
+   */
+  protected List<CrawlDatum> parse(CrawlDatum fetchDatum) {
+    List<CrawlDatum> parseDatums = new ArrayList<CrawlDatum>(0);
+    if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
+      CrawlDatum signatureDatum = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
+      signatureDatum.setSignature(getSignature());
+      parseDatums.add(signatureDatum);
+    }
+    return parseDatums;
+  }
+
+  /**
+   * default implementation to check the result state
+   * 
+   * @param datum
+   *          the CrawlDatum to be checked
+   * @return true if the check succeeds
+   */
+  protected boolean check(CrawlDatum datum) {
+    return datum.getStatus() == expectedDbStatus;
+  }
+
+  /**
+   * Run the continuous crawl.
+   * <p>
+   * A loop emulates a continuous crawl launched at regular intervals (see
+   * {@link #setInterval(int)}) over a longer period (see
+   * {@link #setDuration(int)}):
+   * <ul>
+   * <li>every "round" emulates
+   * <ul>
+   * <li>a fetch (see {@link #fetch(CrawlDatum, long)})</li>
+   * <li>{@literal updatedb} which returns a {@link CrawlDatum}</li>
+   * </ul>
+   * </li>
+   * <li>the returned CrawlDatum is used as input for the next round</li>
+   * <li>and is checked whether it is correct (see {@link #check(CrawlDatum)})</li>
+   * </ul>
+   * 
+   * @param maxErrors
+   *          (if > 0) continue the crawl even if the checked CrawlDatum is
+   *          not correct, but stop after this maximum number of errors
+   * 
+   * @return false if a check of a CrawlDatum failed, true otherwise
+   */
+  protected boolean run(int maxErrors) {
+
+    long now = System.currentTimeMillis();
+
+    CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>(
+        new CrawlDbReducer(), configuration);
+
+    /* start with a db_unfetched */
+    CrawlDatum dbDatum = new CrawlDatum();
+    dbDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    schedule.initializeSchedule(dummyURL, dbDatum); // initialize fetchInterval
+    dbDatum.setFetchTime(now);
+
+    LOG.info("Emulate a continuous crawl, launched every "
+        + (interval / (FetchSchedule.SECONDS_PER_DAY * 1000)) + " day ("
+        + (interval / 1000) + " seconds)");
+    long maxTime = (now + duration);
+    long nextTime = now;
+    long lastFetchTime = -1;
+    boolean ok = true; // record failure but keep going
+    CrawlDatum fetchDatum = new CrawlDatum();
+    /*
+     * Keep copies because CrawlDbReducer.reduce() and
+     * FetchSchedule.shouldFetch() may alter the references. Copies are used for
+     * verbose logging in case of an error.
+     */
+    CrawlDatum copyDbDatum = new CrawlDatum();
+    CrawlDatum copyFetchDatum = new CrawlDatum();
+    CrawlDatum afterShouldFetch = new CrawlDatum();
+    int errorCount = 0;
+    while (nextTime < maxTime) {
+      LOG.info("check: " + new Date(nextTime));
+      fetchDatum.set(dbDatum);
+      copyDbDatum.set(dbDatum);
+      if (schedule.shouldFetch(dummyURL, fetchDatum, nextTime)) {
+        LOG.info("... fetching now (" + new Date(nextTime) + ")");
+        if (lastFetchTime > -1) {
+          LOG.info("(last fetch: " + new Date(lastFetchTime) + " = "
+              + TimingUtil.elapsedTime(lastFetchTime, nextTime) + " ago)");
+        }
+        lastFetchTime = nextTime;
+        afterShouldFetch.set(fetchDatum);
+        fetchDatum = fetch(fetchDatum, nextTime);
+        copyFetchDatum.set(fetchDatum);
+        List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+        values.add(dbDatum);
+        values.add(fetchDatum);
+        values.addAll(parse(fetchDatum));
+        List<CrawlDatum> res = updateDb.update(values);
+        assertNotNull("null returned", res);
+        assertFalse("no CrawlDatum", 0 == res.size());
+        assertEquals("more than one CrawlDatum", 1, res.size());
+        if (!check(res.get(0))) {
+          LOG.info("previously in CrawlDb: " + copyDbDatum);
+          LOG.info("after shouldFetch(): " + afterShouldFetch);
+          LOG.info("fetch: " + fetchDatum);
+          LOG.warn("wrong result in CrawlDb: " + res.get(0));
+          if (++errorCount >= maxErrors) {
+            if (maxErrors > 0) {
+              LOG.error("Max. number of errors " + maxErrors
+                  + " reached. Stopping.");
+            }
+            return false;
+          } else {
+            ok = false; // record failure but keep going
+          }
+        }
+        /* use the returned CrawlDatum for the next fetch */
+        dbDatum = res.get(0);
+      }
+      nextTime += interval;
+    }
+    return ok;
+  }
+
+}
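
A sketch of how a test might drive this emulation. The constructors and
setters are protected, so a caller in the same org.apache.nutch.crawl package
is assumed; the status constants come from CrawlDatum:

    package org.apache.nutch.crawl;

    import junit.framework.TestCase;

    public class ContinuousCrawlExample extends TestCase {
      public void testFetchedStaysFetched() {
        ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(
            CrawlDatum.STATUS_FETCH_SUCCESS, CrawlDatum.STATUS_DB_FETCHED);
        crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY);      // daily
        crawlUtil.setDuration(60 * FetchSchedule.SECONDS_PER_DAY); // ~2 months
        assertTrue("unexpected CrawlDatum state", crawlUtil.run(5));
      }
    }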

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
new file mode 100644
index 0000000..56905e4
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.Text;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.bio.SocketConnector;
+import org.mortbay.jetty.handler.ContextHandler;
+import org.mortbay.jetty.handler.ResourceHandler;
+
+public class CrawlDBTestUtil {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDBTestUtil.class);
+
+  /**
+   * Creates a synthetic crawldb.
+   * 
+   * @param conf
+   *          configuration for the MapFile writer
+   * @param fs
+   *          filesystem where the db will be created
+   * @param crawldb
+   *          path where the db will be created
+   * @param init
+   *          urls to be inserted, objects are of type URLCrawlDatum
+   * @throws Exception
+   */
+  public static void createCrawlDb(Configuration conf, FileSystem fs,
+      Path crawldb, List<URLCrawlDatum> init) throws Exception {
+    LOG.trace("* creating crawldb: " + crawldb);
+    Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir,
+        "part-r-00000"), wKeyOpt, wValueOpt);
+    Iterator<URLCrawlDatum> it = init.iterator();
+    while (it.hasNext()) {
+      URLCrawlDatum row = it.next();
+      LOG.info("adding:" + row.url.toString());
+      writer.append(new Text(row.url), row.datum);
+    }
+    writer.close();
+  }
+
+  /**
+   * For now we need to manually construct our Configuration, because we need to
+   * override the default one and it is currently not possible to use
+   * dynamically set values.
+   * 
+   * @return a test configuration
+   * @deprecated Use {@link #createConfiguration()} instead
+   */
+  @Deprecated
+  public static Configuration create() {
+    return createConfiguration();
+  }
+
+  /**
+   * For now we need to manually construct our Configuration, because we need to
+   * override the default one and it is currently not possible to use
+   * dynamically set values.
+   * 
+   * @return a test configuration loading nutch-default.xml and crawl-tests.xml
+   */
+  public static Configuration createConfiguration() {
+    Configuration conf = new Configuration();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("crawl-tests.xml");
+    return conf;
+  }
+
+  public static class URLCrawlDatum {
+
+    public Text url;
+
+    public CrawlDatum datum;
+
+    public URLCrawlDatum(Text url, CrawlDatum datum) {
+      this.url = url;
+      this.datum = datum;
+    }
+  }
+
+  /**
+   * Generate seedlist
+   * 
+   * @throws IOException
+   */
+  public static void generateSeedList(FileSystem fs, Path urlPath,
+      List<String> urls) throws IOException {
+    generateSeedList(fs, urlPath, urls, new ArrayList<String>());
+  }
+
+  /**
+   * Generate seedlist
+   * 
+   * @throws IOException
+   */
+  public static void generateSeedList(FileSystem fs, Path urlPath,
+      List<String> urls, List<String> metadata) throws IOException {
+    FSDataOutputStream out;
+    Path file = new Path(urlPath, "urls.txt");
+    fs.mkdirs(urlPath);
+    out = fs.create(file);
+
+    Iterator<String> urls_i = urls.iterator();
+    Iterator<String> metadata_i = metadata.iterator();
+
+    String url;
+    String md;
+    while (urls_i.hasNext()) {
+      url = urls_i.next();
+
+      out.writeBytes(url);
+
+      if (metadata_i.hasNext()) {
+        md = metadata_i.next();
+        out.writeBytes(md);
+      }
+
+      out.writeBytes("\n");
+    }
+
+    out.flush();
+    out.close();
+  }
+
+  /**
+   * Creates a new JettyServer with one static root context
+   * 
+   * @param port
+   *          port to listen to
+   * @param staticContent
+   *          folder where static content lives
+   * @throws UnknownHostException
+   */
+  public static Server getServer(int port, String staticContent)
+      throws UnknownHostException {
+    Server webServer = new Server();
+    SocketConnector listener = new SocketConnector();
+    listener.setPort(port);
+    listener.setHost("127.0.0.1");
+    webServer.addConnector(listener);
+    ContextHandler staticContext = new ContextHandler();
+    staticContext.setContextPath("/");
+    staticContext.setResourceBase(staticContent);
+    staticContext.addHandler(new ResourceHandler());
+    webServer.addHandler(staticContext);
+    return webServer;
+  }
+}
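
A sketch of seeding a synthetic CrawlDb with the helper above, assuming a
local FileSystem and a scratch path (both illustrative):

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.nutch.crawl.CrawlDBTestUtil;
    import org.apache.nutch.crawl.CrawlDatum;

    public class CrawlDbSetupExample {
      public static void main(String[] args) throws Exception {
        Configuration conf = CrawlDBTestUtil.createConfiguration();
        FileSystem fs = FileSystem.get(conf);
        Path crawldb = new Path("build/test/crawldb"); // scratch dir (assumed)
        List<CrawlDBTestUtil.URLCrawlDatum> init =
            new ArrayList<CrawlDBTestUtil.URLCrawlDatum>();
        init.add(new CrawlDBTestUtil.URLCrawlDatum(
            new Text("http://nutch.apache.org/"),
            new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0)));
        CrawlDBTestUtil.createCrawlDb(conf, fs, crawldb, init);
      }
    }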

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
new file mode 100644
index 0000000..7238f88
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configuration.IntegerRanges;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.Counters;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Partitioner;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.Reducer.Context;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.apache.hadoop.security.Credentials;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+
+/**
+ * Utility to test transitions of {@link CrawlDatum} states during an update
+ * of {@link CrawlDb} (command {@literal updatedb}): run the reducer under
+ * test through MRUnit's {@link ReduceDriver} with the old CrawlDatum (db
+ * status) and the new one (fetch status)
+ */
+public class CrawlDbUpdateTestDriver<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDbUpdateTestDriver.class);
+
+  private ReduceDriver<Text, CrawlDatum, Text, CrawlDatum> reduceDriver;
+  private T reducer;
+  private Configuration configuration;
+
+  public static Text dummyURL = new Text("http://nutch.apache.org/");
+
+  protected CrawlDbUpdateTestDriver(T updateReducer, Configuration conf) {
+    reducer = updateReducer;
+    configuration = conf;
+  }
+
+  /**
+   * run the reducer under test through MRUnit's {@link ReduceDriver}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   *
+   * @param values
+   *          list of input CrawlDatums
+   * @return list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(List<CrawlDatum> values) {
+    List<CrawlDatum> result = new ArrayList<CrawlDatum>(0);
+    if (values == null || values.size() == 0) {
+      return result;
+    }
+    Collections.shuffle(values); // sorting of values should have no influence
+    reduceDriver = ReduceDriver.newReduceDriver(reducer);
+    reduceDriver.setConfiguration(configuration);
+    reduceDriver.withInput(dummyURL, values);
+    List<Pair<Text,CrawlDatum>> reduceResult;
+    try {
+      reduceResult = reduceDriver.run();
+      for (Pair<Text,CrawlDatum> p : reduceResult) {
+        if (p.getFirst().equals(dummyURL)) {
+          result.add(p.getSecond());
+        }
+      }
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+      return result;
+    }
+    return result;
+  }
+
+  /**
+   * run the reducer under test through MRUnit's {@link ReduceDriver}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   *
+   * @param dbDatum
+   *          previous CrawlDatum in CrawlDb
+   * @param fetchDatum
+   *          CrawlDatum resulting from fetching
+   * @return list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
+    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+    if (dbDatum != null)
+      values.add(dbDatum);
+    if (fetchDatum != null)
+      values.add(fetchDatum);
+    return update(values);
+  }
+
+  /**
+   * see {@link #update(List)}
+   */
+  public List<CrawlDatum> update(CrawlDatum... values) {
+    return update(Arrays.asList(values));
+  }
+
+}
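
A minimal usage sketch for the driver above, assuming a caller in the same
package (the constructor is protected) and a hypothetical mapreduce-API
reducer "MyUpdateReducer" with the matching Text/CrawlDatum signature:

    Configuration conf = NutchConfiguration.create();
    CrawlDbUpdateTestDriver<MyUpdateReducer> driver =
        new CrawlDbUpdateTestDriver<MyUpdateReducer>(new MyUpdateReducer(), conf);
    // old db state plus new fetch state in, resulting CrawlDb state out
    List<CrawlDatum> res = driver.update(
        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 86400),
        new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 86400));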

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
new file mode 100644
index 0000000..bfb716d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.Counters;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility to test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}): call
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} with
+ * the old CrawlDatum (db status) and the new one (fetch status)
+ */
+public class CrawlDbUpdateUtil<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlDbUpdateUtil.class);
+
+  private T reducer;
+
+  public static Text dummyURL = new Text("http://nutch.apache.org/");
+
+  protected CrawlDbUpdateUtil(T red, Configuration conf) {
+    reducer = red;
+    reducer.configure(new JobConf(conf));
+  }
+
+  /** {@link OutputCollector} to collect all values in a {@link List} */
+  private class ListOutputCollector implements
+      OutputCollector<Text, CrawlDatum> {
+
+    private List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+
+    public void collect(Text key, CrawlDatum value) throws IOException {
+      values.add(value);
+    }
+
+    /** collected values as list */
+    public List<CrawlDatum> getValues() {
+      return values;
+    }
+
+  }
+
+  /**
+   * Dummy {@link Reporter} which does nothing and, unlike
+   * {@link Reporter#NULL}, does not return null from getCounter().
+   */
+  private class DummyReporter implements Reporter {
+
+    private Counters dummyCounters = new Counters();
+
+    public void progress() {
+    }
+
+    public Counter getCounter(Enum<?> arg0) {
+      return dummyCounters.getGroup("dummy").getCounterForName("dummy");
+    }
+
+    public Counter getCounter(String arg0, String arg1) {
+      return dummyCounters.getGroup("dummy").getCounterForName("dummy");
+    }
+
+    public InputSplit getInputSplit() throws UnsupportedOperationException {
+      throw new UnsupportedOperationException("Dummy reporter without input");
+    }
+
+    public void incrCounter(Enum<?> arg0, long arg1) {
+    }
+
+    public void incrCounter(String arg0, String arg1, long arg2) {
+    }
+
+    public void setStatus(String arg0) {
+    }
+
+    public float getProgress() {
+      return 1f;
+    }
+
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   * 
+   * @param values
+   *          list of input CrawlDatums
+   * @return list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(List<CrawlDatum> values) {
+    if (values == null || values.size() == 0) {
+      return new ArrayList<CrawlDatum>(0);
+    }
+    Collections.shuffle(values); // order of values should have no influence
+    ListOutputCollector output = new ListOutputCollector();
+    try {
+      reducer.reduce(dummyURL, values.iterator(), output, new DummyReporter());
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+    }
+    return output.getValues();
+  }
+
+  /**
+   * run
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+   * and return the CrawlDatum(s) which would have been written into CrawlDb
+   * 
+   * @param dbDatum
+   *          previous CrawlDatum in CrawlDb
+   * @param fetchDatum
+   *          CrawlDatum resulting from fetching
+   * @return list of resulting CrawlDatum(s) in CrawlDb
+   */
+  public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
+    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+    if (dbDatum != null)
+      values.add(dbDatum);
+    if (fetchDatum != null)
+      values.add(fetchDatum);
+    return update(values);
+  }
+
+  /**
+   * see {@link #update(List)}
+   */
+  public List<CrawlDatum> update(CrawlDatum... values) {
+    return update(Arrays.asList(values));
+  }
+
+}
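
The old-API utility above can be exercised the same way; a sketch (again
assuming a same-package caller, since the constructor is protected) that
feeds CrawlDbReducer a db_unfetched datum plus a fetch_gone datum and
inspects what would be written back to CrawlDb:

    Configuration conf = CrawlDBTestUtil.createConfiguration();
    CrawlDbUpdateUtil<CrawlDbReducer> util =
        new CrawlDbUpdateUtil<CrawlDbReducer>(new CrawlDbReducer(), conf);
    List<CrawlDatum> res = util.update(
        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 86400),
        new CrawlDatum(CrawlDatum.STATUS_FETCH_GONE, 86400));
    // expect a single datum, typically with status db_gone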

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java b/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
new file mode 100644
index 0000000..94c27b5
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.IntWritable;
+
+public class DummyWritable extends IntWritable {
+
+  public DummyWritable() {
+
+  }
+
+  public DummyWritable(int i) {
+    super(i);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
new file mode 100644
index 0000000..fd88c7d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
@@ -0,0 +1,171 @@
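+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+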
+package org.apache.nutch.crawl;
+
+import static org.apache.nutch.crawl.CrawlDatum.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.TimingUtil;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ IntegrationTest.class})
+public class TODOTestCrawlDbStates extends TestCrawlDbStates {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TODOTestCrawlDbStates.class);
+
+  /**
+   * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max
+   * is reached. Retry counter has to be reset appropriately.
+   */
+  @Test
+  public void testCrawlDbReducerPageRetrySchedule() {
+    LOG.info("NUTCH-578: test long running continuous crawl with fetch_retry");
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestFetchRetry();
+    // keep going long enough to "provoke" a retry counter overflow
+    if (!crawlUtil.run(150)) {
+      fail("fetch_retry did not result in a db_gone if retry counter > maxRetries (NUTCH-578)");
+    }
+  }
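+
+  // Expected transitions with the default db.fetch.retry.max = 3, as
+  // encoded in check() below (a sketch, not asserted verbatim):
+  //   fetch_retry 1 -> db_unfetched (retries = 1)
+  //   fetch_retry 2 -> db_unfetched (retries = 2)
+  //   fetch_retry 3 and later -> db_gone (retry counter reset)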
+
+  private class ContinuousCrawlTestFetchRetry extends ContinuousCrawlTestUtil {
+
+    private int retryMax = 3;
+    private int totalRetries = 0;
+
+    ContinuousCrawlTestFetchRetry() {
+      super();
+      fetchStatus = STATUS_FETCH_RETRY;
+      retryMax = configuration.getInt("db.fetch.retry.max", retryMax);
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      datum.setStatus(fetchStatus);
+      datum.setFetchTime(currentTime);
+      totalRetries++;
+      return datum;
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (result.getRetriesSinceFetch() > retryMax) {
+        LOG.warn("Retry counter > db.fetch.retry.max: " + result);
+      } else if (result.getRetriesSinceFetch() == Byte.MAX_VALUE) {
+        LOG.warn("Retry counter max. value reached (overflow imminent): "
+            + result);
+      } else if (result.getRetriesSinceFetch() < 0) {
+        LOG.error("Retry counter overflow: " + result);
+        return false;
+      }
+      // use the retry counter bound to this class (totalRetries)
+      // instead of result.getRetriesSinceFetch() because the retry counter
+      // in CrawlDatum could be reset (e.g., NUTCH-578_v5.patch)
+      if (totalRetries < retryMax) {
+        if (result.getStatus() == STATUS_DB_UNFETCHED) {
+          LOG.info("ok: " + result);
+          return true;
+        }
+        }
+      } else {
+        if (result.getStatus() == STATUS_DB_GONE) {
+          LOG.info("ok: " + result);
+          return true;
+        }
+      }
+      LOG.warn("wrong: " + result);
+      return false;
+    }
+
+  }
+
+  /**
+   * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for
+   * documents not modified
+   * <p>
+   * Problem: documents not modified for a longer time are fetched in every
+   * cycle because of an error in the SYNC_DELTA calculation of
+   * {@link AdaptiveFetchSchedule}. <br>
+   * The next fetch time should always be in the future, never in the past.
+   * </p>
+   */
+  @Test
+  public void testAdaptiveFetchScheduleSyncDelta() {
+    LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    conf.setLong("db.fetch.interval.default", 172800); // 2 days
+    conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
+    conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
+    conf.setLong("db.fetch.interval.max", 604800); // 7 days
+    conf.set("db.fetch.schedule.class",
+        "org.apache.nutch.crawl.AdaptiveFetchSchedule");
+    ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime(
+        conf);
+    crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
+    if (!crawlUtil.run(100)) {
+      fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
+    }
+  }
+
+  private class CrawlTestFetchScheduleNotModifiedFetchTime extends
+      CrawlTestFetchNotModified {
+
+    // time of current fetch
+    private long fetchTime;
+
+    private long minInterval;
+    private long maxInterval;
+
+    CrawlTestFetchScheduleNotModifiedFetchTime(Configuration conf) {
+      super(conf);
+      minInterval = conf.getLong("db.fetch.schedule.adaptive.min_interval",
+          86400); // 1 day
+      maxInterval = conf.getLong("db.fetch.schedule.adaptive.max_interval",
+          604800); // 7 days
+      if (conf.getLong("db.fetch.interval.max", 604800) < maxInterval) {
+        maxInterval = conf.getLong("db.fetch.interval.max", 604800);
+      }
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      // remember time of fetching
+      fetchTime = currentTime;
+      return super.fetch(datum, currentTime);
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (result.getStatus() == STATUS_DB_NOTMODIFIED) {
+        // check only status notmodified here
+        long secondsUntilNextFetch = (result.getFetchTime() - fetchTime) / 1000L;
+        if (secondsUntilNextFetch < -1) {
+          // next fetch time is in the past (more than one second)
+          LOG.error("Next fetch time is in the past: " + result);
+          return false;
+        }
+        if (secondsUntilNextFetch < 60) {
+          // next fetch time is in less than one minute
+          // (critical: Nutch can hardly be so fast)
+          LOG.error("Less then one minute until next fetch: " + result);
+        }
+        // Next fetch time should be within min. and max. (tolerance: 60 sec.)
+        if (secondsUntilNextFetch + 60 < minInterval
+            || secondsUntilNextFetch - 60 > maxInterval) {
+          LOG.error("Interval until next fetch time ("
+              + TimingUtil.elapsedTime(fetchTime, result.getFetchTime())
+              + ") is not within min. and max. interval: " + result);
+          // TODO: is this a failure?
+        }
+      }
+      return true;
+    }
+
+  }
+
+}
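
For reference, the SYNC_DELTA calculation targeted by NUTCH-1564 looks
roughly as follows (a simplified sketch of
AdaptiveFetchSchedule.setFetchSchedule, not a verbatim copy):

    // seconds between this fetch and the last observed modification
    long delta = (fetchTime - modifiedTime) / 1000L;
    if (delta > interval) interval = (int) delta;
    // the reference time is shifted into the past ...
    refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
    // ... but the interval added back on is capped at max_interval, so for
    // documents unmodified for a long time refTime + interval can end up
    // before fetchTime, i.e. the next fetch time lies in the past
    datum.setFetchTime(refTime + Math.round(interval * 1000.0));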

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
new file mode 100644
index 0000000..3fa798d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for AdaptiveFetchSchedule.
+ */
+public class TestAdaptiveFetchSchedule {
+
+  private float inc_rate;
+  private float dec_rate;
+  private Configuration conf;
+  private long curTime, lastModified;
+  private int changed, interval, calculateInterval;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    inc_rate = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+    dec_rate = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+    interval = 100;
+    lastModified = 0;
+  }
+
+  /**
+   * Test the core functionality of AdaptiveFetchSchedule.
+   */
+  @Test
+  public void testAdaptiveFetchSchedule() {
+
+    FetchSchedule fs = new AdaptiveFetchSchedule();
+    fs.setConf(conf);
+
+    CrawlDatum p = prepareCrawlDatum();
+    Text url = new Text("http://www.example.com");
+
+    changed = FetchSchedule.STATUS_UNKNOWN;
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
+    validateFetchInterval(changed, p.getFetchInterval());
+
+    changed = FetchSchedule.STATUS_MODIFIED;
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
+    validateFetchInterval(changed, p.getFetchInterval());
+    p.setFetchInterval(interval);
+
+    changed = FetchSchedule.STATUS_NOTMODIFIED;
+    fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+        lastModified, changed);
+    validateFetchInterval(changed, p.getFetchInterval());
+
+  }
+
+  /**
+   * Prepare a CrawlDatum (STATUS_DB_UNFETCHED) to test AdaptiveFetchSchedule.
+   * 
+   * @return properly initialized CrawlDatum
+   */
+  public CrawlDatum prepareCrawlDatum() {
+    CrawlDatum p = new CrawlDatum();
+    p.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    p.setFetchInterval(interval);
+    p.setScore(1.0f);
+    p.setFetchTime(0);
+    return p;
+  }
+
+  /**
+   * Validates the fetch interval calculated by AdaptiveFetchSchedule against
+   * the expected value for the given change status.
+   * 
+   * @param changed
+   *          change status the interval was calculated for (STATUS_UNKNOWN,
+   *          STATUS_MODIFIED or STATUS_NOTMODIFIED)
+   * @param getInterval
+   *          fetch interval taken from the CrawlDatum, calculated via the
+   *          AdaptiveFetchSchedule algorithm
+   */
+  private void validateFetchInterval(int changed, int getInterval) {
+
+    if (changed == FetchSchedule.STATUS_UNKNOWN) {
+      assertEquals(interval, getInterval);
+
+    } else if (changed == FetchSchedule.STATUS_MODIFIED) {
+      calculateInterval = (int) (interval - (interval * dec_rate));
+      assertEquals(calculateInterval, getInterval);
+
+    } else if (changed == FetchSchedule.STATUS_NOTMODIFIED) {
+      calculateInterval = (int) (interval + (interval * inc_rate));
+      assertEquals(calculateInterval, getInterval);
+    }
+
+  }
+
+}
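
With the values used above (default inc_rate = dec_rate = 0.2f and
interval = 100 seconds), the expected intervals work out to:

    STATUS_MODIFIED:    (int) (100 - 100 * 0.2f) = 80
    STATUS_NOTMODIFIED: (int) (100 + 100 * 0.2f) = 120
    STATUS_UNKNOWN:     interval unchanged       = 100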

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
new file mode 100644
index 0000000..773dd29
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.*;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchJob;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * CrawlDbFilter test which checks for correct, error-free URL normalization
+ * when the CrawlDb contains URLs with <code>DB_GONE</code> status and
+ * <code>CRAWLDB_PURGE_404</code> is set to true.
+ * 
+ * @author lufeng
+ */
+public class TestCrawlDbFilter {
+  Configuration conf;
+  Path dbDir;
+  Path newCrawlDb;
+  final static Path testdir = new Path("build/test/crawldbfilter-test");
+  FileSystem fs;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = CrawlDBTestUtil.createConfiguration();
+    fs = FileSystem.get(conf);
+    fs.delete(testdir, true);
+  }
+
+  @After
+  public void tearDown() {
+    delete(testdir);
+  }
+
+  private void delete(Path p) {
+    try {
+      fs.delete(p, true);
+    } catch (IOException e) {
+    }
+  }
+
+  /**
+   * Test that URLs with status <code>DB_GONE</code> are purged from the
+   * CrawlDb when <code>CRAWLDB_PURGE_404</code> is enabled.
+   * 
+   * @throws Exception
+   */
+  @Test
+  @Category({IntegrationTest.class})
+  public void testUrl404Purging() throws Exception {
+    // create a CrawlDatum with DB GONE status
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+    list.add(new URLCrawlDatum(new Text("http://www.example.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
+    list.add(new URLCrawlDatum(new Text("http://www.example1.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
+    list.add(new URLCrawlDatum(new Text("http://www.example2.com"),
+        new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
+    dbDir = new Path(testdir, "crawldb");
+    newCrawlDb = new Path(testdir, "newcrawldb");
+    // create crawldb
+    CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
+    // set CRAWLDB_PURGE_404 to true
+    conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404, true);
+    conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
+    conf.setBoolean(CrawlDbFilter.URL_FILTERING, false);
+    conf.setInt("urlnormalizer.loop.count", 2);
+    JobConf job = new NutchJob(conf);
+    job.setJobName("Test CrawlDbFilter");
+    Path current = new Path(dbDir, "current");
+    if (FileSystem.get(job).exists(current)) {
+      FileInputFormat.addInputPath(job, current);
+    }
+    job.setInputFormat(SequenceFileInputFormat.class);
+    job.setMapperClass(CrawlDbFilter.class);
+    job.setReducerClass(CrawlDbReducer.class);
+    FileOutputFormat.setOutputPath(job, newCrawlDb);
+    job.setOutputFormat(MapFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+    JobClient.runJob(job);
+
+    Path fetchlist = new Path(new Path(newCrawlDb, "part-00000"), "data");
+
+    ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
+    // verify we got right amount of records
+    Assert.assertEquals(2, l.size());
+  }
+
+  /**
+   * Read contents of fetchlist.
+   * 
+   * @param fetchlist
+   *          path to Generated fetchlist
+   * @return Generated {@link URLCrawlDatum} objects
+   * @throws IOException
+   */
+  private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
+      throws IOException {
+    // verify results
+    Option fFile = SequenceFile.Reader.file(fetchlist);
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile);
+
+    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+    while (true) {
+      // fresh key/value instances per record, as they are kept in the list
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value)) {
+        break;
+      }
+      l.add(new URLCrawlDatum(key, value));
+    }
+
+    reader.close();
+    return l;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
new file mode 100644
index 0000000..599c353
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.TreeSet;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+public class TestCrawlDbMerger {
+  private static final Logger LOG = Logger.getLogger(TestCrawlDbMerger.class
+      .getName());
+
+  String url10 = "http://example.com/";
+  String url11 = "http://example.com/foo";
+  String url20 = "http://example.com/";
+  String url21 = "http://example.com/bar";
+  String[] urls_expected = new String[] { url10, url11, url21 };
+
+  TreeSet<String> init1 = new TreeSet<String>();
+  TreeSet<String> init2 = new TreeSet<String>();
+  HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>();
+  CrawlDatum cd1, cd2, cd3;
+  Configuration conf;
+  FileSystem fs;
+  Path testDir;
+  CrawlDbReader reader;
+
+  @Before
+  public void setUp() throws Exception {
+    init1.add(url10);
+    init1.add(url11);
+    init2.add(url20);
+    init2.add(url21);
+    long time = System.currentTimeMillis();
+    cd1 = new CrawlDatum();
+    cd1.setFetchInterval(1.0f);
+    cd1.setFetchTime(time);
+    cd1.getMetaData().put(new Text("name"), new Text("cd1"));
+    cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));
+    cd2 = new CrawlDatum();
+    cd2.setFetchInterval(1.0f);
+    cd2.setFetchTime(time + 10000);
+    cd2.getMetaData().put(new Text("name"), new Text("cd2"));
+    cd3 = new CrawlDatum();
+    cd3.setFetchInterval(1.0f);
+    cd3.setFetchTime(time + 10000);
+    cd3.getMetaData().putAll(cd1.getMetaData());
+    cd3.getMetaData().putAll(cd2.getMetaData());
+    expected.put(url10, cd3);
+    expected.put(url11, cd1);
+    expected.put(url21, cd2);
+    conf = NutchConfiguration.create();
+    fs = FileSystem.get(conf);
+    testDir = new Path("test-crawldb-" + new java.util.Random().nextInt());
+    fs.mkdirs(testDir);
+  }
+
+  @After
+  public void tearDown() {
+    try {
+      if (fs.exists(testDir))
+        fs.delete(testDir, true);
+    } catch (Exception e) {
+    }
+    try {
+      reader.close();
+    } catch (Exception e) {
+    }
+  }
+
+  /**
+   * Test creates two sample {@link org.apache.nutch.crawl.CrawlDb}'s
+   * populating entries for keys as {@link org.apache.hadoop.io.Text} e.g. URLs 
+   * and values as {@link org.apache.nutch.crawl.CrawlDatum} e.g. record data. 
+   * It then simulates a merge process for the two CrawlDb's via the {@link org.apache.nutch.crawl.CrawlDbMerger}
+   * tool. The merged CrawlDb is then written to an arbitrary output location and the results
+   * read using the {@link org.apache.nutch.crawl.CrawlDbReader} tool. 
+   * Test assertions include comparing expected CrawlDb key, value (URL, CrawlDatum) values
+   * with actual results based on the merge process. 
+   * @throws Exception
+   */
+  @Test
+  @Category({IntegrationTest.class})
+  public void testMerge() throws Exception {
+    Path crawldb1 = new Path(testDir, "crawldb1");
+    Path crawldb2 = new Path(testDir, "crawldb2");
+    Path output = new Path(testDir, "output");
+    createCrawlDb(conf, fs, crawldb1, init1, cd1);
+    createCrawlDb(conf, fs, crawldb2, init2, cd2);
+    CrawlDbMerger merger = new CrawlDbMerger(conf);
+    LOG.fine("* merging crawldbs to " + output);
+    merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false);
+    LOG.fine("* reading crawldb: " + output);
+    reader = new CrawlDbReader();
+    String crawlDb = output.toString();
+    Iterator<String> it = expected.keySet().iterator();
+    while (it.hasNext()) {
+      String url = it.next();
+      LOG.fine("url=" + url);
+      CrawlDatum cd = expected.get(url);
+      CrawlDatum res = reader.get(crawlDb, url, new JobConf(conf));
+      LOG.fine(" -> " + res);
+      System.out.println("url=" + url);
+      System.out.println(" cd " + cd);
+      System.out.println(" res " + res);
+      // must not be null
+      Assert.assertNotNull(res);
+      Assert.assertEquals(cd, res);
+    }
+    reader.close();
+    fs.delete(testDir, true);
+  }
+
+  private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb,
+      TreeSet<String> init, CrawlDatum cd) throws Exception {
+    LOG.fine("* creating crawldb: " + crawldb);
+    Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
+    
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    
+    MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+        "part-r-00000"), wKeyOpt, wValueOpt);
+    Iterator<String> it = init.iterator();
+    while (it.hasNext()) {
+      String key = it.next();
+      writer.append(new Text(key), cd);
+    }
+    writer.close();
+  }
+}
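
A note on the expected merge result encoded in setUp() above: url10 and
url20 are the same URL, so the merger should emit cd3 for it, i.e. the
newer fetch time of cd2 together with the union of cd1's and cd2's
metadata; url11 and url21 occur in only one input CrawlDb each and keep
their original datums:

    http://example.com/     -> cd3 (cd2's fetch time, metadata of cd1 + cd2)
    http://example.com/foo  -> cd1
    http://example.com/bar  -> cd2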