You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:46 UTC
[30/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
new file mode 100644
index 0000000..d6f33a3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.html
@@ -0,0 +1,91 @@
<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
 license agreements. See the NOTICE file distributed with this work for additional
 information regarding copyright ownership. The ASF licenses this file to
 You under the Apache License, Version 2.0 (the "License"); you may not use
 this file except in compliance with the License. You may obtain a copy of
 the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
 by applicable law or agreed to in writing, software distributed under the
 License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
 OF ANY KIND, either express or implied. See the License for the specific
 language governing permissions and limitations under the License. -->
<!DOCTYPE html>
<html xmlns:wicket="http://wicket.apache.org">
<head>
<meta charset="utf-8" />
<title>Wicket extend</title>
</head>

<body>
  <wicket:extend>
    <h2>
      <wicket:message key="page.header.seedList">Seed list</wicket:message>
    </h2>

    <div class="row">
      <div class="col-lg-8">
        <form class="form-horizontal" wicket:id="seedList">
          <fieldset>
            <!-- Text input; id added so the label's "for" attribute resolves -->
            <div class="form-group">
              <label class="col-md-4 control-label" for="textinput">Seed list name</label>
              <div class="col-md-4">
                <input wicket:id="name" id="textinput" name="textinput" class="form-control input-md" type="text">
              </div>
            </div>
            <div class="form-group">
              <div class="col-md-offset-4 col-md-4">
                <button type="submit" class="btn btn-primary">Save</button>
              </div>
            </div>
          </fieldset>
        </form>
        <h3>Seed urls</h3>
        <table class="table table-hover table-striped tablesorter">
          <thead>
            <tr>
              <th class="header col-md-3">Url</th>
              <th></th>
            </tr>
          </thead>

          <tbody wicket:id="seedUrlsTable">
            <tr wicket:id="seedUrls">
              <td>
                <span wicket:id="url">http://google.com</span>
              </td>
              <td>
                <button wicket:id="delete" class="btn btn-sm btn-danger" type="button">
                  <span class="fa fa-trash-o"></span>
                </button>
              </td>
            </tr>
          </tbody>
        </table>
        <form class="form-horizontal" wicket:id="urlForm">
          <fieldset>
            <div class="form-group">
              <div class="col-md-4">
                <input wicket:id="url" name="url" class="form-control input-md" type="text">
              </div>
              <div>
                <button wicket:id="addUrl" class="btn btn-primary">Add url</button>
              </div>
            </div>
          </fieldset>
        </form>
      </div>
      <div class="col-lg-4">
        <div class="panel panel-primary">
          <div class="panel-heading">
            <h3 class="panel-title">Help</h3>
          </div>
          <div class="panel-body">
            <p>Some help about seed management</p>
          </div>
        </div>
      </div>
    </div>
    <!--row-->
  </wicket:extend>
</body>
</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
new file mode 100644
index 0000000..fba07ab
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/seed/SeedPage.java
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.pages.seed;
+
+import java.util.Iterator;
+
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.model.SeedUrl;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.SeedListService;
+import org.apache.wicket.ajax.AjaxRequestTarget;
+import org.apache.wicket.ajax.markup.html.AjaxLink;
+import org.apache.wicket.ajax.markup.html.form.AjaxSubmitLink;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.form.Form;
+import org.apache.wicket.markup.html.form.TextField;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.CompoundPropertyModel;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.model.LoadableDetachableModel;
+import org.apache.wicket.model.Model;
+import org.apache.wicket.request.mapper.parameter.PageParameters;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+import com.google.common.collect.Lists;
+
+/**
+ * This page is for seed urls management
+ *
+ * @author feodor
+ *
+ */
+public class SeedPage extends AbstractBasePage<SeedList> {
+
+ @SpringBean
+ private SeedListService seedListService;
+
+ private Form<SeedUrl> urlForm;
+
+ private WebMarkupContainer seedUrlsTable;
+
+ public SeedPage() {
+ SeedList list = new SeedList();
+ list.setSeedUrls(Lists.<SeedUrl> newArrayList());
+ initPage(Model.of(list));
+ }
+
+ public SeedPage(final PageParameters parameters) {
+ initPage(new LoadableDetachableModel<SeedList>() {
+
+ @Override
+ protected SeedList load() {
+ Long seedListId = parameters.get("id").toLongObject();
+ return seedListService.getSeedList(seedListId);
+ }
+ });
+ }
+
+ public void initPage(IModel<SeedList> model) {
+ setModel(new CompoundPropertyModel<SeedList>(model));
+
+ addBaseForm();
+ addSeedUrlsList();
+ addUrlForm();
+ }
+
+ private void addBaseForm() {
+ Form<SeedList> form = new Form<SeedList>("seedList", getModel()) {
+ @Override
+ protected void onSubmit() {
+ seedListService.save(getModelObject());
+ setResponsePage(SeedListsPage.class);
+ }
+ };
+ form.add(new TextField<String>("name"));
+ add(form);
+ }
+
+ private void addSeedUrlsList() {
+ seedUrlsTable = new WebMarkupContainer("seedUrlsTable");
+ seedUrlsTable.setOutputMarkupId(true);
+
+ RefreshingView<SeedUrl> seedUrls = new RefreshingView<SeedUrl>("seedUrls") {
+
+ @Override
+ protected Iterator<IModel<SeedUrl>> getItemModels() {
+ return new CpmIteratorAdapter<SeedUrl>(getModelObject().getSeedUrls());
+ }
+
+ @Override
+ protected void populateItem(Item<SeedUrl> item) {
+ item.add(new Label("url"));
+ item.add(new AjaxLink<SeedUrl>("delete", item.getModel()) {
+
+ @Override
+ public void onClick(AjaxRequestTarget target) {
+ deleteSeedUrl(getModelObject());
+ target.add(seedUrlsTable);
+ }
+ });
+ }
+ };
+ seedUrlsTable.add(seedUrls);
+ add(seedUrlsTable);
+ }
+
+ private void addUrlForm() {
+ urlForm = new Form<SeedUrl>("urlForm", CompoundPropertyModel.of(Model
+ .of(new SeedUrl())));
+ urlForm.setOutputMarkupId(true);
+ urlForm.add(new TextField<String>("url"));
+ urlForm.add(new AjaxSubmitLink("addUrl", urlForm) {
+ @Override
+ protected void onSubmit(AjaxRequestTarget target, Form<?> form) {
+ addSeedUrl();
+ urlForm.setModelObject(new SeedUrl());
+ target.add(urlForm);
+ target.add(seedUrlsTable);
+ }
+ });
+ add(urlForm);
+ }
+
+ private void addSeedUrl() {
+ SeedUrl url = urlForm.getModelObject();
+ SeedList seedList = getModelObject();
+ url.setSeedList(seedList);
+ seedList.getSeedUrls().add(url);
+ }
+
+ private void deleteSeedUrl(SeedUrl url) {
+ SeedList seedList = getModelObject();
+ seedList.getSeedUrls().remove(url);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
new file mode 100644
index 0000000..8810371
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.html
@@ -0,0 +1,43 @@
<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
 license agreements. See the NOTICE file distributed with this work for additional
 information regarding copyright ownership. The ASF licenses this file to
 You under the Apache License, Version 2.0 (the "License"); you may not use
 this file except in compliance with the License. You may obtain a copy of
 the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
 by applicable law or agreed to in writing, software distributed under the
 License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
 OF ANY KIND, either express or implied. See the License for the specific
 language governing permissions and limitations under the License. -->
<!DOCTYPE html>
<html xmlns:wicket="http://wicket.apache.org">
<head>
<meta charset="utf-8" />
<title>Wicket extend</title>
</head>

<body>
  <wicket:extend>
    <h2>
      <wicket:message key="settings">Settings</wicket:message>
    </h2>
    <div class="row">
      <div class="col-lg-12">
        <table class="table table-hover tablesorter table-bordered">
          <thead>
            <tr>
              <th class="header col-lg-3">
                <wicket:message key="settings.header.name">Name</wicket:message>
              </th>
              <th class="header col-lg-9">
                <wicket:message key="settings.header.value">Value</wicket:message>
              </th>
            </tr>
          </thead>
          <tbody wicket:id="settingsTable">
            <tr wicket:id="settings">
              <td>
                <span wicket:id="name">Name</span>
              </td>
              <td>
                <input class="col-lg-12" wicket:id="value" placeholder="http://localhost:8080">
              </td>
            </tr>
          </tbody>
        </table>
      </div>
    </div>
  </wicket:extend>

</body>
</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
new file mode 100644
index 0000000..29e46f7
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
@@ -0,0 +1,59 @@
+package org.apache.nutch.webui.pages.settings;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.nutch.webui.model.NutchConfig;
+import org.apache.nutch.webui.pages.AbstractBasePage;
+import org.apache.nutch.webui.pages.components.CpmIteratorAdapter;
+import org.apache.nutch.webui.service.NutchService;
+import org.apache.wicket.markup.html.WebMarkupContainer;
+import org.apache.wicket.markup.html.basic.Label;
+import org.apache.wicket.markup.html.form.TextField;
+import org.apache.wicket.markup.repeater.Item;
+import org.apache.wicket.markup.repeater.RefreshingView;
+import org.apache.wicket.model.IModel;
+import org.apache.wicket.spring.injection.annot.SpringBean;
+
+public class SettingsPage extends AbstractBasePage<Void> {
+ @SpringBean
+ private NutchService nutchService;
+
+ private WebMarkupContainer settingsTable;
+
+ public SettingsPage() {
+ settingsTable = new WebMarkupContainer("settingsTable");
+ settingsTable.setOutputMarkupId(true);
+ RefreshingView<NutchConfig> nutchConfig = new RefreshingView<NutchConfig>(
+ "settings") {
+
+ @Override
+ protected Iterator<IModel<NutchConfig>> getItemModels() {
+ return new CpmIteratorAdapter<NutchConfig>(
+ convertNutchConfig(nutchService.getNutchConfig(getCurrentInstance()
+ .getId())));
+ }
+
+ @Override
+ protected void populateItem(Item<NutchConfig> item) {
+ item.add(new Label("name"));
+ item.add(new TextField<String>("value"));
+ }
+ };
+ settingsTable.add(nutchConfig);
+ add(settingsTable);
+ }
+
+ private List<NutchConfig> convertNutchConfig(Map<String, String> map) {
+ List<NutchConfig> listNutchConfigs = new LinkedList<NutchConfig>();
+ for (String key : map.keySet()) {
+ NutchConfig conf = new NutchConfig();
+ conf.setName(key);
+ conf.setValue(map.get(key));
+ listNutchConfigs.add(conf);
+ }
+ return listNutchConfigs;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
new file mode 100644
index 0000000..c742b48
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/CrawlService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.model.NutchInstance;
+
+public interface CrawlService {
+
+ public void saveCrawl(Crawl crawl);
+
+ public List<Crawl> getCrawls();
+
+ void startCrawl(Long crawlId, NutchInstance instance);
+
+ void deleteCrawl(Long crawlId);
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
new file mode 100644
index 0000000..23f27e8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchInstanceService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.NutchInstance;
+
+public interface NutchInstanceService {
+
+ public List<NutchInstance> getInstances();
+
+ public void saveInstance(NutchInstance instance);
+
+ public void removeInstance(Long id);
+
+ public NutchInstance getInstance(Long id);
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
new file mode 100644
index 0000000..643236a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/NutchService.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.Map;
+
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.NutchStatus;
+
+public interface NutchService {
+ public ConnectionStatus getConnectionStatus(Long instanceId);
+
+ public Map<String, String> getNutchConfig(Long instanceId);
+
+ public NutchStatus getNutchStatus(Long instanceId);
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
new file mode 100644
index 0000000..dda8c71
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/SeedListService.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service;
+
+import java.util.List;
+
+import org.apache.nutch.webui.model.SeedList;
+
+public interface SeedListService {
+
+ public void save(SeedList seedList);
+
+ public void delete(Long seedListId);
+
+ public List<SeedList> findAll();
+
+ public SeedList getSeedList(Long seedListId);
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
new file mode 100644
index 0000000..7bb133b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClient;
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.client.impl.CrawlingCycle;
+import org.apache.nutch.webui.client.impl.RemoteCommandsBatchFactory;
+import org.apache.nutch.webui.client.impl.CrawlingCycleListener;
+import org.apache.nutch.webui.client.impl.RemoteCommand;
+import org.apache.nutch.webui.client.impl.RemoteCommandExecutor;
+import org.apache.nutch.webui.client.model.Crawl;
+import org.apache.nutch.webui.client.model.Crawl.CrawlStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.CrawlService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.scheduling.annotation.Async;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+@Service
+public class CrawlServiceImpl implements CrawlService, CrawlingCycleListener {
+ private Logger log = LoggerFactory.getLogger(CrawlServiceImpl.class);
+
+ @Resource
+ private Dao<Crawl, Long> crawlDao;
+
+ @Resource
+ private NutchClientFactory nutchClientFactory;
+
+ @Resource
+ private RemoteCommandsBatchFactory commandFactory;
+
+ @Override
+ @Async
+ public void startCrawl(Long crawlId, NutchInstance instance) {
+ Crawl crawl = null;
+ try {
+ crawl = crawlDao.queryForId(crawlId);
+ if(crawl.getCrawlId()==null) {
+ crawl.setCrawlId("crawl-" + crawlId.toString());
+ }
+ NutchClient client = nutchClientFactory.getClient(instance);
+ String seedDirectory = client.createSeed(crawl.getSeedList());
+ crawl.setSeedDirectory(seedDirectory);
+
+ List<RemoteCommand> commands = commandFactory.createCommands(crawl);
+ RemoteCommandExecutor executor = new RemoteCommandExecutor(client);
+
+ CrawlingCycle cycle = new CrawlingCycle(this, executor, crawl, commands);
+ cycle.executeCrawlCycle();
+
+ } catch (Exception e) {
+ crawl.setStatus(CrawlStatus.ERROR);
+ saveCrawl(crawl);
+ log.error("exception occured", e);
+ }
+ }
+
+ @Override
+ public List<Crawl> getCrawls() {
+ try {
+ return crawlDao.queryForAll();
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void saveCrawl(Crawl crawl) {
+ try {
+ crawlDao.createOrUpdate(crawl);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void deleteCrawl(Long crawlId) {
+ try {
+ crawlDao.deleteById(crawlId);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void crawlingStarted(Crawl crawl) {
+ crawl.setStatus(CrawlStatus.CRAWLING);
+ crawl.setProgress(0);
+ saveCrawl(crawl);
+ }
+
+ @Override
+ public void onCrawlError(Crawl crawl, String msg) {
+ crawl.setStatus(CrawlStatus.ERROR);
+ saveCrawl(crawl);
+ }
+
+ @Override
+ public void commandExecuted(Crawl crawl, RemoteCommand command, int progress) {
+ crawl.setProgress(progress);
+ saveCrawl(crawl);
+ }
+
+ @Override
+ public void crawlingFinished(Crawl crawl) {
+ crawl.setStatus(CrawlStatus.FINISHED);
+ saveCrawl(crawl);
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
new file mode 100644
index 0000000..e100054
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+@Service
+public class NutchInstanceServiceImpl implements NutchInstanceService {
+
+ @Resource
+ private NutchClientFactory nutchClientFactory;
+
+ @Resource
+ private Dao<NutchInstance, Long> instancesDao;
+
+ @Override
+ public List<NutchInstance> getInstances() {
+ try {
+ return instancesDao.queryForAll();
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+
+ }
+
+ @Override
+ public NutchInstance getInstance(Long id) {
+ try {
+ return instancesDao.queryForId(id);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void saveInstance(NutchInstance instance) {
+ try {
+ instancesDao.createOrUpdate(instance);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void removeInstance(Long id) {
+ try {
+ instancesDao.deleteById(id);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
new file mode 100644
index 0000000..db989cf
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.net.ConnectException;
+import java.util.Collections;
+import java.util.Map;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.client.NutchClientFactory;
+import org.apache.nutch.webui.client.model.ConnectionStatus;
+import org.apache.nutch.webui.client.model.NutchStatus;
+import org.apache.nutch.webui.model.NutchInstance;
+import org.apache.nutch.webui.service.NutchInstanceService;
+import org.apache.nutch.webui.service.NutchService;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.stereotype.Service;
+
+import com.sun.jersey.api.client.ClientHandlerException;
+
+@Service
+public class NutchServiceImpl implements NutchService {
+ private static final Logger logger = LoggerFactory
+ .getLogger(NutchServiceImpl.class);
+
+ @Resource
+ private NutchClientFactory nutchClientFactory;
+
+ @Resource
+ private NutchInstanceService instanceService;
+
+ @Override
+ public ConnectionStatus getConnectionStatus(Long instanceId) {
+ NutchInstance instance = instanceService.getInstance(instanceId);
+ try {
+ NutchStatus nutchStatus = nutchClientFactory.getClient(instance)
+ .getNutchStatus();
+ if (nutchStatus.getStartDate() != null) {
+ return ConnectionStatus.CONNECTED;
+ }
+ } catch (Exception e) {
+ if (e.getCause() instanceof ConnectException) {
+ return ConnectionStatus.DISCONNECTED;
+ }
+
+ logger.error("Cannot connect to nutch server!", e);
+ }
+ return null;
+ }
+
+ @Override
+ public Map<String, String> getNutchConfig(Long instanceId) {
+ NutchInstance instance = instanceService.getInstance(instanceId);
+ try {
+ return nutchClientFactory.getClient(instance).getNutchConfig("default");
+ } catch (ClientHandlerException exception) {
+ return Collections.emptyMap();
+ }
+ }
+
+ @Override
+ public NutchStatus getNutchStatus(Long instanceId) {
+ NutchInstance instance = instanceService.getInstance(instanceId);
+ return nutchClientFactory.getClient(instance).getNutchStatus();
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
new file mode 100644
index 0000000..fced2d3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.webui.service.impl;
+
+import java.sql.SQLException;
+import java.util.List;
+
+import javax.annotation.Resource;
+
+import org.apache.nutch.webui.model.SeedList;
+import org.apache.nutch.webui.model.SeedUrl;
+import org.apache.nutch.webui.service.SeedListService;
+import org.springframework.stereotype.Service;
+
+import com.j256.ormlite.dao.Dao;
+
+/**
+ * ORMLite-backed CRUD service for {@link SeedList} entities. All
+ * {@link SQLException}s are rethrown unchecked as {@link RuntimeException}.
+ */
+@Service
+public class SeedListServiceImpl implements SeedListService {
+
+ @Resource
+ private Dao<SeedList, Long> seedListDao;
+
+ // NOTE(review): injected but not referenced by any method in this class —
+ // confirm whether it is needed (e.g. for cascading SeedUrl persistence).
+ @Resource
+ private Dao<SeedUrl, Long> seedUrlDao;
+
+ /** Inserts the seed list, or updates it if it already exists. */
+ @Override
+ public void save(SeedList seedList) {
+ try {
+ seedListDao.createOrUpdate(seedList);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /** Deletes the seed list with the given id (no-op if absent). */
+ @Override
+ public void delete(Long seedListId) {
+ try {
+ seedListDao.deleteById(seedListId);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+
+ }
+
+ /** @return all persisted seed lists */
+ @Override
+ public List<SeedList> findAll() {
+ try {
+ return seedListDao.queryForAll();
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /** @return the seed list with the given id, or null if not found */
+ @Override
+ public SeedList getSeedList(Long seedListId) {
+ try {
+ return seedListDao.queryForId(seedListId);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/overview.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/overview.html b/nutch-core/src/main/java/overview.html
new file mode 100644
index 0000000..1132141
--- /dev/null
+++ b/nutch-core/src/main/java/overview.html
@@ -0,0 +1,9 @@
+<html>
+<head>
+ <title>Apache Nutch</title>
+</head>
+<body>
+<p>Apache Nutch is a highly extensible and scalable open source web crawler software project.</p>
+<p>Nutch is a project of the Apache Software Foundation and is part of the larger Apache community of developers and users.</p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
new file mode 100644
index 0000000..bb938a6
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
@@ -0,0 +1,270 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.crawl.CrawlDbUpdateUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Emulate a continuous crawl for one URL.
+ *
+ */
+public class ContinuousCrawlTestUtil extends TestCase {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(ContinuousCrawlTestUtil.class);
+
+ /** single URL the emulated crawl revolves around */
+ protected static Text dummyURL = new Text("http://nutch.apache.org/");
+
+ protected static Configuration defaultConfig = CrawlDBTestUtil
+ .createConfiguration();
+
+ protected long interval = FetchSchedule.SECONDS_PER_DAY * 1000; // (default)
+ // launch
+ // crawler
+ // every day
+ protected long duration = 2 * 365L * FetchSchedule.SECONDS_PER_DAY * 1000L; // run
+ // for
+ // two
+ // years
+
+ protected Configuration configuration;
+ private FetchSchedule schedule;
+
+ /** status a fetched datum should get */
+ protected byte fetchStatus = CrawlDatum.STATUS_FETCH_SUCCESS;
+ /** expected status of the resulting Db datum */
+ protected byte expectedDbStatus = CrawlDatum.STATUS_DB_FETCHED;
+
+ /** for signature calculation */
+ protected Signature signatureImpl;
+ protected Content content = new Content();
+
+ // instance initializer: seed the document content with a few dummy bytes so
+ // that signature calculation has something to hash
+ {
+ byte[] data = { 'n', 'u', 't', 'c', 'h' };
+ content.setContent(data);
+ }
+
+ protected ContinuousCrawlTestUtil(Configuration conf) {
+ configuration = conf;
+ schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf));
+ signatureImpl = SignatureFactory.getSignature(conf);
+ }
+
+ protected ContinuousCrawlTestUtil(Configuration conf, byte fetchStatus,
+ byte expectedDbStatus) {
+ this(conf);
+ this.fetchStatus = fetchStatus;
+ this.expectedDbStatus = expectedDbStatus;
+ }
+
+ protected ContinuousCrawlTestUtil() {
+ this(defaultConfig);
+ }
+
+ protected ContinuousCrawlTestUtil(byte fetchStatus, byte expectedDbStatus) {
+ this(defaultConfig, fetchStatus, expectedDbStatus);
+ }
+
+ /** set the interval the crawl is relaunched (default: every day) */
+ protected void setInterval(int seconds) {
+ interval = seconds * 1000L;
+ }
+
+ /**
+ * set the duration of the continuous crawl (default = 2 years)
+ * (NOTE(review): method name misspells "duration" — kept as-is because
+ * subclasses/callers may already reference it)
+ */
+ protected void setDuraction(int seconds) {
+ duration = seconds * 1000L;
+ }
+
+ /**
+ * default fetch action: set status and time
+ *
+ * @param datum
+ * CrawlDatum to fetch
+ * @param currentTime
+ * current time used to set the fetch time via
+ * {@link CrawlDatum#setFetchTime(long)}
+ * @return the modified CrawlDatum
+ */
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ datum.setStatus(fetchStatus);
+ datum.setFetchTime(currentTime);
+ return datum;
+ }
+
+ /**
+ * get signature for content and configured signature implementation
+ */
+ protected byte[] getSignature() {
+ return signatureImpl.calculate(content, null);
+ }
+
+ /**
+ * change content to force a changed signature
+ */
+ protected void changeContent() {
+ byte[] data = Arrays.copyOf(content.getContent(),
+ content.getContent().length + 1);
+ data[content.getContent().length] = '2'; // append one byte
+ content.setContent(data);
+ LOG.info("document content changed");
+ }
+
+ /**
+ * default parse action: add signature if successfully fetched
+ *
+ * @param fetchDatum
+ * fetch datum
+ * @return list of all datums resulting from parse (status: signature, linked,
+ * parse_metadata)
+ */
+ protected List<CrawlDatum> parse(CrawlDatum fetchDatum) {
+ List<CrawlDatum> parseDatums = new ArrayList<CrawlDatum>(0);
+ if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
+ CrawlDatum signatureDatum = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
+ signatureDatum.setSignature(getSignature());
+ parseDatums.add(signatureDatum);
+ }
+ return parseDatums;
+ }
+
+ /**
+ * default implementation to check the result state
+ *
+ * @param datum
+ * the CrawlDatum to be checked
+ * @return true if the check succeeds
+ */
+ protected boolean check(CrawlDatum datum) {
+ if (datum.getStatus() != expectedDbStatus)
+ return false;
+ return true;
+ }
+
+ /**
+ * Run the continuous crawl.
+ * <p>
+ * A loop emulates a continuous crawl launched in regular intervals (see
+ * {@link #setInterval(int)}) over a longer period ({@link #setDuraction(int)}).
+ *
+ * <ul>
+ * <li>every "round" emulates
+ * <ul>
+ * <li>a fetch (see {@link #fetch(CrawlDatum, long)})</li>
+ * <li>{@literal updatedb} which returns a {@link CrawlDatum}</li>
+ * </ul>
+ * <li>the returned CrawlDatum is used as input for the next round</li>
+ * <li>and is checked whether it is correct (see {@link #check(CrawlDatum)})
+ * </ul>
+ * </p>
+ *
+ * @param maxErrors
+ * (if > 0) continue crawl even if the checked CrawlDatum is not
+ * correct, but stop after max. number of errors
+ *
+ * @return false if a check of CrawlDatum failed, true otherwise
+ */
+ protected boolean run(int maxErrors) {
+
+ long now = System.currentTimeMillis();
+
+ CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>(
+ new CrawlDbReducer(), configuration);
+
+ /* start with a db_unfetched */
+ CrawlDatum dbDatum = new CrawlDatum();
+ dbDatum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ schedule.initializeSchedule(dummyURL, dbDatum); // initialize fetchInterval
+ dbDatum.setFetchTime(now);
+
+ LOG.info("Emulate a continuous crawl, launched every "
+ + (interval / (FetchSchedule.SECONDS_PER_DAY * 1000)) + " day ("
+ + (interval / 1000) + " seconds)");
+ long maxTime = (now + duration);
+ long nextTime = now;
+ long lastFetchTime = -1;
+ boolean ok = true; // record failure but keep going
+ CrawlDatum fetchDatum = new CrawlDatum();
+ /*
+ * Keep copies because CrawlDbReducer.reduce() and
+ * FetchSchedule.shouldFetch() may alter the references. Copies are used for
+ * verbose logging in case of an error.
+ */
+ CrawlDatum copyDbDatum = new CrawlDatum();
+ CrawlDatum copyFetchDatum = new CrawlDatum();
+ CrawlDatum afterShouldFetch = new CrawlDatum();
+ int errorCount = 0;
+ while (nextTime < maxTime) {
+ LOG.info("check: " + new Date(nextTime));
+ fetchDatum.set(dbDatum);
+ copyDbDatum.set(dbDatum);
+ if (schedule.shouldFetch(dummyURL, fetchDatum, nextTime)) {
+ LOG.info("... fetching now (" + new Date(nextTime) + ")");
+ if (lastFetchTime > -1) {
+ LOG.info("(last fetch: " + new Date(lastFetchTime) + " = "
+ + TimingUtil.elapsedTime(lastFetchTime, nextTime) + " ago)");
+ }
+ lastFetchTime = nextTime;
+ afterShouldFetch.set(fetchDatum);
+ fetchDatum = fetch(fetchDatum, nextTime);
+ copyFetchDatum.set(fetchDatum);
+ List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+ values.add(dbDatum);
+ values.add(fetchDatum);
+ values.addAll(parse(fetchDatum));
+ List<CrawlDatum> res = updateDb.update(values);
+ assertNotNull("null returned", res);
+ assertFalse("no CrawlDatum", 0 == res.size());
+ assertEquals("more than one CrawlDatum", 1, res.size());
+ if (!check(res.get(0))) {
+ LOG.info("previously in CrawlDb: " + copyDbDatum);
+ LOG.info("after shouldFetch(): " + afterShouldFetch);
+ LOG.info("fetch: " + fetchDatum);
+ LOG.warn("wrong result in CrawlDb: " + res.get(0));
+ if (++errorCount >= maxErrors) {
+ if (maxErrors > 0) {
+ LOG.error("Max. number of errors " + maxErrors
+ + " reached. Stopping.");
+ }
+ return false;
+ } else {
+ ok = false; // record failure but keep going
+ }
+ }
+ /* use the returned CrawlDatum for the next fetch */
+ dbDatum = res.get(0);
+ }
+ nextTime += interval;
+ }
+ return ok;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
new file mode 100644
index 0000000..56905e4
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDBTestUtil.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.Text;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.bio.SocketConnector;
+import org.mortbay.jetty.handler.ContextHandler;
+import org.mortbay.jetty.handler.ResourceHandler;
+
+/** Helpers for crawl tests: synthetic CrawlDb, test config, seed lists, Jetty. */
+public class CrawlDBTestUtil {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(CrawlDBTestUtil.class);
+
+ /**
+ * Creates synthetic crawldb
+ *
+ * @param conf
+ * configuration used to open the MapFile writer
+ * @param fs
+ * filesystem where db will be created
+ * @param crawldb
+ * path were db will be created
+ * @param init
+ * urls to be inserted, objects are of type URLCrawlDatum
+ * @throws Exception
+ */
+ public static void createCrawlDb(Configuration conf, FileSystem fs,
+ Path crawldb, List<URLCrawlDatum> init) throws Exception {
+ LOG.trace("* creating crawldb: " + crawldb);
+ Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
+ Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+ MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir,
+ "part-r-00000"), wKeyOpt, wValueOpt);
+ Iterator<URLCrawlDatum> it = init.iterator();
+ while (it.hasNext()) {
+ URLCrawlDatum row = it.next();
+ LOG.info("adding:" + row.url.toString());
+ writer.append(new Text(row.url), row.datum);
+ }
+ writer.close();
+ }
+
+ /**
+ * For now we need to manually construct our Configuration, because we need to
+ * override the default one and it is currently not possible to use
+ * dynamically set values.
+ *
+ * @return a test Configuration (see {@link #createConfiguration()})
+ * @deprecated Use {@link #createConfiguration()} instead
+ */
+ @Deprecated
+ public static Configuration create() {
+ return createConfiguration();
+ }
+
+ /**
+ * For now we need to manually construct our Configuration, because we need to
+ * override the default one and it is currently not possible to use
+ * dynamically set values.
+ *
+ * @return a Configuration with nutch-default.xml and crawl-tests.xml loaded
+ */
+ public static Configuration createConfiguration() {
+ Configuration conf = new Configuration();
+ conf.addResource("nutch-default.xml");
+ conf.addResource("crawl-tests.xml");
+ return conf;
+ }
+
+ /** Simple (url, CrawlDatum) pair used to seed a synthetic crawldb. */
+ public static class URLCrawlDatum {
+
+ public Text url;
+
+ public CrawlDatum datum;
+
+ public URLCrawlDatum(Text url, CrawlDatum datum) {
+ this.url = url;
+ this.datum = datum;
+ }
+ }
+
+ /**
+ * Generate seedlist
+ *
+ * @throws IOException
+ */
+ public static void generateSeedList(FileSystem fs, Path urlPath,
+ List<String> urls) throws IOException {
+ generateSeedList(fs, urlPath, urls, new ArrayList<String>());
+ }
+
+ /**
+ * Generate seedlist: writes urls.txt under urlPath, one URL per line; if a
+ * metadata entry exists for a URL it is appended to the same line.
+ *
+ * @throws IOException
+ */
+ public static void generateSeedList(FileSystem fs, Path urlPath,
+ List<String> urls, List<String> metadata) throws IOException {
+ FSDataOutputStream out;
+ Path file = new Path(urlPath, "urls.txt");
+ fs.mkdirs(urlPath);
+ out = fs.create(file);
+
+ Iterator<String> urls_i = urls.iterator();
+ Iterator<String> metadata_i = metadata.iterator();
+
+ String url;
+ String md;
+ while (urls_i.hasNext()) {
+ url = urls_i.next();
+
+ out.writeBytes(url);
+
+ if (metadata_i.hasNext()) {
+ md = metadata_i.next();
+ out.writeBytes(md);
+ }
+
+ out.writeBytes("\n");
+ }
+
+ out.flush();
+ out.close();
+ }
+
+ /**
+ * Creates a new JettyServer with one static root context
+ *
+ * @param port
+ * port to listen to
+ * @param staticContent
+ * folder where static content lives
+ * @throws UnknownHostException
+ */
+ public static Server getServer(int port, String staticContent)
+ throws UnknownHostException {
+ Server webServer = new org.mortbay.jetty.Server();
+ SocketConnector listener = new SocketConnector();
+ listener.setPort(port);
+ listener.setHost("127.0.0.1");
+ webServer.addConnector(listener);
+ ContextHandler staticContext = new ContextHandler();
+ staticContext.setContextPath("/");
+ staticContext.setResourceBase(staticContent);
+ staticContext.addHandler(new ResourceHandler());
+ webServer.addHandler(staticContext);
+ return webServer;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
new file mode 100644
index 0000000..7238f88
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configuration.IntegerRanges;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.Counters;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Partitioner;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.Reducer.Context;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.apache.hadoop.security.Credentials;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+
+/**
+ * Utility to test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}): call
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+ * (using MRUnit) with the old CrawlDatum (db status) and the new one (fetch
+ * status)
+ */
+public class CrawlDbUpdateTestDriver<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(CrawlDbUpdateTestDriver.class);
+
+ private ReduceDriver<Text, CrawlDatum, Text, CrawlDatum> reduceDriver;
+ private T reducer;
+ private Configuration configuration;
+
+ /** single key under which all test CrawlDatums are reduced */
+ public static Text dummyURL = new Text("http://nutch.apache.org/");
+
+ /**
+ * @param updateReducer reducer under test (e.g. a CrawlDbReducer)
+ * @param conf configuration passed to the MRUnit reduce driver
+ */
+ protected CrawlDbUpdateTestDriver(T updateReducer, Configuration conf) {
+ reducer = updateReducer;
+ configuration = conf;
+ }
+
+ /**
+ * run
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+ * and return the CrawlDatum(s) which would have been written into CrawlDb
+ *
+ * @param values
+ * list of input CrawlDatums
+ * @return list of resulting CrawlDatum(s) in CrawlDb; empty if the reduce
+ *         fails with an IOException (the error is only logged)
+ */
+ public List<CrawlDatum> update(List<CrawlDatum> values) {
+ List<CrawlDatum> result = new ArrayList<CrawlDatum>(0);
+ if (values == null || values.size() == 0) {
+ return result;
+ }
+ Collections.shuffle(values); // sorting of values should have no influence
+ reduceDriver = ReduceDriver.newReduceDriver(reducer);
+ reduceDriver.setConfiguration(configuration);
+ reduceDriver.withInput(dummyURL, values);
+ List<Pair<Text,CrawlDatum>> reduceResult;
+ try {
+ reduceResult = reduceDriver.run();
+ for (Pair<Text,CrawlDatum> p : reduceResult) {
+ if (p.getFirst().equals(dummyURL)) {
+ result.add(p.getSecond());
+ }
+ }
+ } catch (IOException e) {
+ LOG.error(StringUtils.stringifyException(e));
+ return result;
+ }
+ return result;
+ }
+
+ /**
+ * run
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+ * and return the CrawlDatum(s) which would have been written into CrawlDb
+ *
+ * @param dbDatum
+ * previous CrawlDatum in CrawlDb
+ * @param fetchDatum
+ * CrawlDatum resulting from fetching
+ * @return list of resulting CrawlDatum(s) in CrawlDb
+ */
+ public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
+ List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+ if (dbDatum != null)
+ values.add(dbDatum);
+ if (fetchDatum != null)
+ values.add(fetchDatum);
+ return update(values);
+ }
+
+ /**
+ * see {@link #update(List)}
+ */
+ public List<CrawlDatum> update(CrawlDatum... values) {
+ return update(Arrays.asList(values));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
new file mode 100644
index 0000000..bfb716d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.Counters;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility to test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}): call
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)} with
+ * the old CrawlDatum (db status) and the new one (fetch status)
+ */
+public class CrawlDbUpdateUtil<T extends Reducer<Text, CrawlDatum, Text, CrawlDatum>> {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(CrawlDbUpdateUtil.class);
+
+ private T reducer;
+
+ /** single key under which all test CrawlDatums are reduced */
+ public static Text dummyURL = new Text("http://nutch.apache.org/");
+
+ /**
+ * @param red reducer under test; it is configured here with a JobConf
+ *        wrapping the given Configuration
+ * @param conf configuration used to configure the reducer
+ */
+ protected CrawlDbUpdateUtil(T red, Configuration conf) {
+ reducer = red;
+ reducer.configure(new JobConf(conf));
+ }
+
+ /** {@link OutputCollector} to collect all values in a {@link List} */
+ private class ListOutputCollector implements
+ OutputCollector<Text, CrawlDatum> {
+
+ private List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+
+ public void collect(Text key, CrawlDatum value) throws IOException {
+ values.add(value);
+ }
+
+ /** collected values as list */
+ public List<CrawlDatum> getValues() {
+ return values;
+ }
+
+ }
+
+ /**
+ * Dummy reporter which does nothing and does not return null for getCounter()
+ *
+ * @see Reporter#NULL
+ */
+ private class DummyReporter implements Reporter {
+
+ private Counters dummyCounters = new Counters();
+
+ public void progress() {
+ }
+
+ public Counter getCounter(Enum<?> arg0) {
+ return dummyCounters.getGroup("dummy").getCounterForName("dummy");
+ }
+
+ public Counter getCounter(String arg0, String arg1) {
+ return dummyCounters.getGroup("dummy").getCounterForName("dummy");
+ }
+
+ public InputSplit getInputSplit() throws UnsupportedOperationException {
+ throw new UnsupportedOperationException("Dummy reporter without input");
+ }
+
+ public void incrCounter(Enum<?> arg0, long arg1) {
+ }
+
+ public void incrCounter(String arg0, String arg1, long arg2) {
+ }
+
+ public void setStatus(String arg0) {
+ }
+
+ public float getProgress() {
+ return 1f;
+ }
+
+ }
+
+ /**
+ * run
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+ * and return the CrawlDatum(s) which would have been written into CrawlDb
+ *
+ * @param values
+ * list of input CrawlDatums
+ * @return list of resulting CrawlDatum(s) in CrawlDb; an IOException from
+ *         reduce() is logged and whatever was collected so far is returned
+ */
+ public List<CrawlDatum> update(List<CrawlDatum> values) {
+ if (values == null || values.size() == 0) {
+ return new ArrayList<CrawlDatum>(0);
+ }
+ Collections.shuffle(values); // sorting of values should have no influence
+ ListOutputCollector output = new ListOutputCollector();
+ try {
+ reducer.reduce(dummyURL, values.iterator(), output, new DummyReporter());
+ } catch (IOException e) {
+ LOG.error(StringUtils.stringifyException(e));
+ }
+ return output.getValues();
+ }
+
+ /**
+ * run
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)}
+ * and return the CrawlDatum(s) which would have been written into CrawlDb
+ *
+ * @param dbDatum
+ * previous CrawlDatum in CrawlDb
+ * @param fetchDatum
+ * CrawlDatum resulting from fetching
+ * @return list of resulting CrawlDatum(s) in CrawlDb
+ */
+ public List<CrawlDatum> update(CrawlDatum dbDatum, CrawlDatum fetchDatum) {
+ List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+ if (dbDatum != null)
+ values.add(dbDatum);
+ if (fetchDatum != null)
+ values.add(fetchDatum);
+ return update(values);
+ }
+
+ /**
+ * see {@link #update(List)}
+ */
+ public List<CrawlDatum> update(CrawlDatum... values) {
+ return update(Arrays.asList(values));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java b/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
new file mode 100644
index 0000000..94c27b5
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/DummyWritable.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.IntWritable;
+
+/** Trivial {@link IntWritable} subclass used as a placeholder in tests. */
+public class DummyWritable extends IntWritable {
+
+ public DummyWritable() {
+
+ }
+
+ public DummyWritable(int i) {
+ super(i);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
new file mode 100644
index 0000000..fd88c7d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
@@ -0,0 +1,171 @@
+package org.apache.nutch.crawl;
+
+import static org.apache.nutch.crawl.CrawlDatum.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.TimingUtil;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Category({ IntegrationTest.class})
+public class TODOTestCrawlDbStates extends TestCrawlDbStates {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(TODOTestCrawlDbStates.class);
+
+ /**
+ * NUTCH-578: a fetch_retry should result in a db_gone if db.fetch.retry.max
+ * is reached. Retry counter has to be reset appropriately.
+ */
+ @Test
+ public void testCrawlDbReducerPageRetrySchedule() {
+ LOG.info("NUTCH-578: test long running continuous crawl with fetch_retry");
+ ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestFetchRetry();
+ // keep going for long, to "provoke" a retry counter overflow
+ if (!crawlUtil.run(150)) {
+ fail("fetch_retry did not result in a db_gone if retry counter > maxRetries (NUTCH-578)");
+ }
+ }
+
+ private class ContinuousCrawlTestFetchRetry extends ContinuousCrawlTestUtil {
+
+ private int retryMax = 3;
+ private int totalRetries = 0;
+
+ ContinuousCrawlTestFetchRetry() {
+ super();
+ fetchStatus = STATUS_FETCH_RETRY;
+ retryMax = configuration.getInt("db.fetch.retry.max", retryMax);
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ datum.setStatus(fetchStatus);
+ datum.setFetchTime(currentTime);
+ totalRetries++;
+ return datum;
+ }
+
+ @Override
+ protected boolean check(CrawlDatum result) {
+ if (result.getRetriesSinceFetch() > retryMax) {
+ LOG.warn("Retry counter > db.fetch.retry.max: " + result);
+ } else if (result.getRetriesSinceFetch() == Byte.MAX_VALUE) {
+ LOG.warn("Retry counter max. value reached (overflow imminent): "
+ + result);
+ } else if (result.getRetriesSinceFetch() < 0) {
+ LOG.error("Retry counter overflow: " + result);
+ return false;
+ }
+ // use retry counter bound to this class (totalRetries)
+ // instead of result.getRetriesSinceFetch() because the retry counter
+ // in CrawlDatum could be reset (eg. NUTCH-578_v5.patch)
+ if (totalRetries < retryMax) {
+ if (result.getStatus() == STATUS_DB_UNFETCHED) {
+ LOG.info("ok: " + result);
+ result.getRetriesSinceFetch();
+ return true;
+ }
+ } else {
+ if (result.getStatus() == STATUS_DB_GONE) {
+ LOG.info("ok: " + result);
+ return true;
+ }
+ }
+ LOG.warn("wrong: " + result);
+ return false;
+ }
+
+ }
+
+ /**
+ * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for
+ * documents not modified
+ * <p>
+ * Problem: documents not modified for a longer time are fetched in every
+ * cycle because of an error in the SYNC_DELTA calculation of
+ * {@link AdaptiveFetchSchedule}. <br>
+ * The next fetch time should always be in the future, never in the past.
+ * </p>
+ */
+ @Test
+ public void testAdaptiveFetchScheduleSyncDelta() {
+ LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule");
+ Configuration conf = CrawlDBTestUtil.createConfiguration();
+ conf.setLong("db.fetch.interval.default", 172800); // 2 days
+ conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
+ conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
+ conf.setLong("db.fetch.interval.max", 604800); // 7 days
+ conf.set("db.fetch.schedule.class",
+ "org.apache.nutch.crawl.AdaptiveFetchSchedule");
+ ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime(
+ conf);
+ crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
+ if (!crawlUtil.run(100)) {
+ fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
+ }
+ }
+
+ private class CrawlTestFetchScheduleNotModifiedFetchTime extends
+ CrawlTestFetchNotModified {
+
+ // time of current fetch
+ private long fetchTime;
+
+ private long minInterval;
+ private long maxInterval;
+
+ CrawlTestFetchScheduleNotModifiedFetchTime(Configuration conf) {
+ super(conf);
+ minInterval = conf.getLong("db.fetch.schedule.adaptive.min_interval",
+ 86400); // 1 day
+ maxInterval = conf.getLong("db.fetch.schedule.adaptive.max_interval",
+ 604800); // 7 days
+ if (conf.getLong("db.fetch.interval.max", 604800) < maxInterval) {
+ maxInterval = conf.getLong("db.fetch.interval.max", 604800);
+ }
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ // remember time of fetching
+ fetchTime = currentTime;
+ return super.fetch(datum, currentTime);
+ }
+
+ @Override
+ protected boolean check(CrawlDatum result) {
+ if (result.getStatus() == STATUS_DB_NOTMODIFIED) {
+ // check only status notmodified here
+ long secondsUntilNextFetch = (result.getFetchTime() - fetchTime) / 1000L;
+ if (secondsUntilNextFetch < -1) {
+ // next fetch time is in the past (more than one second)
+ LOG.error("Next fetch time is in the past: " + result);
+ return false;
+ }
+ if (secondsUntilNextFetch < 60) {
+ // next fetch time is in less than one minute
+ // (critical: Nutch can hardly be so fast)
+ LOG.error("Less then one minute until next fetch: " + result);
+ }
+ // Next fetch time should be within min. and max. (tolerance: 60 sec.)
+ if (secondsUntilNextFetch + 60 < minInterval
+ || secondsUntilNextFetch - 60 > maxInterval) {
+ LOG.error("Interval until next fetch time ("
+ + TimingUtil.elapsedTime(fetchTime, result.getFetchTime())
+ + ") is not within min. and max. interval: " + result);
+ // TODO: is this a failure?
+ }
+ }
+ return true;
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
new file mode 100644
index 0000000..3fa798d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test cases for AdaptiveFetchSchedule.
+ *
+ */
+public class TestAdaptiveFetchSchedule extends TestCase {
+
+ private float inc_rate;
+ private float dec_rate;
+ private Configuration conf;
+ private long curTime, lastModified;
+ private int changed, interval, calculateInterval;
+
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ conf = NutchConfiguration.create();
+ inc_rate = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+ dec_rate = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+ interval = 100;
+ lastModified = 0;
+ }
+
+ /**
+ * Test the core functionality of AdaptiveFetchSchedule.
+ *
+ */
+
+ @Test
+ public void testAdaptiveFetchSchedule() {
+
+ FetchSchedule fs = new AdaptiveFetchSchedule();
+ fs.setConf(conf);
+
+ CrawlDatum p = prepareCrawlDatum();
+ Text url = new Text("http://www.example.com");
+
+ changed = FetchSchedule.STATUS_UNKNOWN;
+ fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+ lastModified, changed);
+ validateFetchInterval(changed, p.getFetchInterval());
+
+ changed = FetchSchedule.STATUS_MODIFIED;
+ fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+ lastModified, changed);
+ validateFetchInterval(changed, p.getFetchInterval());
+ p.setFetchInterval(interval);
+
+ changed = FetchSchedule.STATUS_NOTMODIFIED;
+ fs.setFetchSchedule(url, p, p.getFetchTime(), p.getModifiedTime(), curTime,
+ lastModified, changed);
+ validateFetchInterval(changed, p.getFetchInterval());
+
+ }
+
+ /**
+ * Prepare a CrawlDatum (STATUS_DB_UNFETCHED) to Test AdaptiveFetchSchedule.
+ *
+ * @return properly initialized CrawlDatum
+ */
+ public CrawlDatum prepareCrawlDatum() {
+ CrawlDatum p = new CrawlDatum();
+ p.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ p.setFetchInterval(interval);
+ p.setScore(1.0f);
+ p.setFetchTime(0);
+ return p;
+ }
+
+ /**
+ *
+ * The Method validates interval values according to changed parameter.
+ *
+ * @param changed
+ * status value to check calculated interval value.
+ * @param getInterval
+ * to test IntervalValue from CrawlDatum which is calculated via
+ * AdaptiveFetchSchedule algorithm.
+ */
+ private void validateFetchInterval(int changed, int getInterval) {
+
+ if (changed == FetchSchedule.STATUS_UNKNOWN) {
+ assertEquals(getInterval, interval);
+
+ } else if (changed == FetchSchedule.STATUS_MODIFIED) {
+ calculateInterval = (int) (interval - (interval * dec_rate));
+ assertEquals(getInterval, calculateInterval);
+
+ } else if (changed == FetchSchedule.STATUS_NOTMODIFIED) {
+ calculateInterval = (int) (interval + (interval * inc_rate));
+ assertEquals(getInterval, calculateInterval);
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
new file mode 100644
index 0000000..773dd29
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.*;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchJob;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * CrawlDbFiltering test which tests for correct, error free url normalization
+ * when the CrawlDB includes urls with <code>DB GONE</code> status and
+ * <code>CRAWLDB_PURGE_404</code> is set to true.
+ *
+ * @author lufeng
+ */
+public class TestCrawlDbFilter {
+ Configuration conf;
+ Path dbDir;
+ Path newCrawlDb;
+ final static Path testdir = new Path("build/test/crawldbfilter-test");
+ FileSystem fs;
+
+ @Before
+ public void setUp() throws Exception {
+ conf = CrawlDBTestUtil.createConfiguration();
+ fs = FileSystem.get(conf);
+ fs.delete(testdir, true);
+ }
+
+ @After
+ public void tearDown() {
+ delete(testdir);
+ }
+
+ private void delete(Path p) {
+ try {
+ fs.delete(p, true);
+ } catch (IOException e) {
+ }
+ }
+
+ /**
+ * Test url404Purging
+ *
+ * @throws Exception
+ */
+ @Test
+ @Category({IntegrationTest.class})
+ public void testUrl404Purging() throws Exception {
+ // create a CrawlDatum with DB GONE status
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+ list.add(new URLCrawlDatum(new Text("http://www.example.com"),
+ new CrawlDatum(CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
+ list.add(new URLCrawlDatum(new Text("http://www.example1.com"),
+ new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
+ list.add(new URLCrawlDatum(new Text("http://www.example2.com"),
+ new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
+ dbDir = new Path(testdir, "crawldb");
+ newCrawlDb = new Path(testdir, "newcrawldb");
+ // create crawldb
+ CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
+ // set CRAWLDB_PURGE_404 to true
+ conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404, true);
+ conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
+ conf.setBoolean(CrawlDbFilter.URL_FILTERING, false);
+ conf.setInt("urlnormalizer.loop.count", 2);
+ JobConf job = new NutchJob(conf);
+ job.setJobName("Test CrawlDbFilter");
+ Path current = new Path(dbDir, "current");
+ if (FileSystem.get(job).exists(current)) {
+ FileInputFormat.addInputPath(job, current);
+ }
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setMapperClass(CrawlDbFilter.class);
+ job.setReducerClass(CrawlDbReducer.class);
+ FileOutputFormat.setOutputPath(job, newCrawlDb);
+ job.setOutputFormat(MapFileOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(CrawlDatum.class);
+ JobClient.runJob(job);
+
+ Path fetchlist = new Path(new Path(newCrawlDb, "part-00000"), "data");
+
+ ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
+ // verify we got right amount of records
+ Assert.assertEquals(2, l.size());
+ }
+
+ /**
+ * Read contents of fetchlist.
+ *
+ * @param fetchlist
+ * path to Generated fetchlist
+ * @return Generated {@link URLCrawlDatum} objects
+ * @throws IOException
+ */
+ private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
+ throws IOException {
+ // verify results
+ Option fFile = SequenceFile.Reader.file(fetchlist);
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile);
+
+ ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+ READ: do {
+ Text key = new Text();
+ CrawlDatum value = new CrawlDatum();
+ if (!reader.next(key, value)) {
+ break READ;
+ }
+ l.add(new URLCrawlDatum(key, value));
+ } while (true);
+
+ reader.close();
+ return l;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
new file mode 100644
index 0000000..599c353
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.TreeSet;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+public class TestCrawlDbMerger {
+ private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class
+ .getName());
+
+ String url10 = "http://example.com/";
+ String url11 = "http://example.com/foo";
+ String url20 = "http://example.com/";
+ String url21 = "http://example.com/bar";
+ String[] urls_expected = new String[] { url10, url11, url21 };
+
+ TreeSet<String> init1 = new TreeSet<String>();
+ TreeSet<String> init2 = new TreeSet<String>();
+ HashMap<String, CrawlDatum> expected = new HashMap<String, CrawlDatum>();
+ CrawlDatum cd1, cd2, cd3;
+ Configuration conf;
+ FileSystem fs;
+ Path testDir;
+ CrawlDbReader reader;
+
+ @Before
+ public void setUp() throws Exception {
+ init1.add(url10);
+ init1.add(url11);
+ init2.add(url20);
+ init2.add(url21);
+ long time = System.currentTimeMillis();
+ cd1 = new CrawlDatum();
+ cd1.setFetchInterval(1.0f);
+ cd1.setFetchTime(time);
+ cd1.getMetaData().put(new Text("name"), new Text("cd1"));
+ cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));
+ cd2 = new CrawlDatum();
+ cd2.setFetchInterval(1.0f);
+ cd2.setFetchTime(time + 10000);
+ cd2.getMetaData().put(new Text("name"), new Text("cd2"));
+ cd3 = new CrawlDatum();
+ cd3.setFetchInterval(1.0f);
+ cd3.setFetchTime(time + 10000);
+ cd3.getMetaData().putAll(cd1.getMetaData());
+ cd3.getMetaData().putAll(cd2.getMetaData());
+ expected.put(url10, cd3);
+ expected.put(url11, cd1);
+ expected.put(url21, cd2);
+ conf = NutchConfiguration.create();
+ fs = FileSystem.get(conf);
+ testDir = new Path("test-crawldb-" + new java.util.Random().nextInt());
+ fs.mkdirs(testDir);
+ }
+
+ @After
+ public void tearDown() {
+ try {
+ if (fs.exists(testDir))
+ fs.delete(testDir, true);
+ } catch (Exception e) {
+ }
+ try {
+ reader.close();
+ } catch (Exception e) {
+ }
+ }
+
+ /**
+ * Test creates two sample {@link org.apache.nutch.crawl.CrawlDb}'s
+ * populating entries for keys as {@link org.apache.hadoop.io.Text} e.g. URLs
+ * and values as {@link org.apache.nutch.crawl.CrawlDatum} e.g. record data.
+ * It then simulates a merge process for the two CrawlDb's via the {@link org.apache.nutch.crawl.CrawlDbMerger}
+ * tool. The merged CrawlDb is then written to an arbitrary output location and the results
+ * read using the {@link org.apache.nutch.crawl.CrawlDbReader} tool.
+ * Test assertions include comparing expected CrawlDb key, value (URL, CrawlDatum) values
+ * with actual results based on the merge process.
+ * @throws Exception
+ */
+ @Test
+ @Category({IntegrationTest.class})
+ public void testMerge() throws Exception {
+ Path crawldb1 = new Path(testDir, "crawldb1");
+ Path crawldb2 = new Path(testDir, "crawldb2");
+ Path output = new Path(testDir, "output");
+ createCrawlDb(conf, fs, crawldb1, init1, cd1);
+ createCrawlDb(conf, fs, crawldb2, init2, cd2);
+ CrawlDbMerger merger = new CrawlDbMerger(conf);
+ LOG.fine("* merging crawldbs to " + output);
+ merger.merge(output, new Path[] { crawldb1, crawldb2 }, false, false);
+ LOG.fine("* reading crawldb: " + output);
+ reader = new CrawlDbReader();
+ String crawlDb = output.toString();
+ Iterator<String> it = expected.keySet().iterator();
+ while (it.hasNext()) {
+ String url = it.next();
+ LOG.fine("url=" + url);
+ CrawlDatum cd = expected.get(url);
+ CrawlDatum res = reader.get(crawlDb, url, new JobConf(conf));
+ LOG.fine(" -> " + res);
+ System.out.println("url=" + url);
+ System.out.println(" cd " + cd);
+ System.out.println(" res " + res);
+ // may not be null
+ Assert.assertNotNull(res);
+ Assert.assertTrue(cd.equals(res));
+ }
+ reader.close();
+ fs.delete(testDir, true);
+ }
+
+ private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb,
+ TreeSet<String> init, CrawlDatum cd) throws Exception {
+ LOG.fine("* creating crawldb: " + crawldb);
+ Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
+
+ Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+
+ MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+ "part-r-00000"), wKeyOpt, wValueOpt);
+ Iterator<String> it = init.iterator();
+ while (it.hasNext()) {
+ String key = it.next();
+ writer.append(new Text(key), cd);
+ }
+ writer.close();
+ }
+}