You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/04/07 17:16:09 UTC
[1/2] nutch git commit: NUTCH-2222 re-fetch deletes all metadata
except _csh_ and _rs_
Repository: nutch
Updated Branches:
refs/heads/2.x 1e65c3f6b -> d868f06cf
NUTCH-2222 re-fetch deletes all metadata except _csh_ and _rs_
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/fd478448
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/fd478448
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/fd478448
Branch: refs/heads/2.x
Commit: fd478448bb4a68ad53520ae7b325204a7834782a
Parents: 3e80673
Author: Lewis John McGibbney <le...@jpl.nasa.gov>
Authored: Mon Mar 21 20:46:18 2016 -0700
Committer: Lewis John McGibbney <le...@jpl.nasa.gov>
Committed: Mon Mar 21 20:46:18 2016 -0700
----------------------------------------------------------------------
.../org/apache/nutch/crawl/GeneratorJob.java | 14 +++-
src/test/nutch-site.xml | 7 ++
.../org/apache/nutch/fetcher/TestFetcher.java | 84 ++++++++++++++++++--
3 files changed, 94 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/java/org/apache/nutch/crawl/GeneratorJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java
index aae2ba9..e06a192 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorJob.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java
@@ -240,9 +240,17 @@ public class GeneratorJob extends NutchTool implements Tool {
/**
* Mark URLs ready for fetching.
- *
- * @throws ClassNotFoundException
- * @throws InterruptedException
+ * @param topN
+ * top threshold for maximum number of URLs permitted in a batch
+ * @param curTime
+ * the current time in milliseconds
+ * @param filter
+ * optional filtering of URLs within the generated batch
+ * @param norm
+ * optional normalization of URLs within the generated batch
+ * @param sitemap
+ * flag indicating whether a URL is a sitemap and hence processed accordingly
+ * @throws Exception
* */
public String generate(long topN, long curTime, boolean filter, boolean norm,
boolean sitemap) throws Exception {
http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/test/nutch-site.xml
----------------------------------------------------------------------
diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml
index 4f3ced4..e599547 100644
--- a/src/test/nutch-site.xml
+++ b/src/test/nutch-site.xml
@@ -22,4 +22,11 @@
<description>Default in-memory datastore class for temp test data.</description>
</property>
+<property>
+ <name>db.fetch.interval.default</name>
+ <value>1</value>
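+ <description>The default number of seconds between re-fetches of a page. Set to 1 second for tests (rather than the usual 30 days) so that pages are due for re-fetch almost immediately.
+ </description>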
+</property>
+
</configuration>
http://git-wip-us.apache.org/repos/asf/nutch/blob/fd478448/src/test/org/apache/nutch/fetcher/TestFetcher.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/nutch/fetcher/TestFetcher.java b/src/test/org/apache/nutch/fetcher/TestFetcher.java
index 2411a61..8a8fa42 100644
--- a/src/test/org/apache/nutch/fetcher/TestFetcher.java
+++ b/src/test/org/apache/nutch/fetcher/TestFetcher.java
@@ -23,9 +23,12 @@ import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.Path;
+import org.apache.nutch.crawl.DbUpdaterJob;
import org.apache.nutch.crawl.GeneratorJob;
import org.apache.nutch.crawl.InjectorJob;
import org.apache.nutch.crawl.URLWebPage;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParserJob;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.storage.Mark;
@@ -34,22 +37,24 @@ import org.apache.nutch.util.AbstractNutchTest;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.CrawlTestUtil;
import org.mortbay.jetty.Server;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import crawlercommons.robots.BaseRobotRules;
import org.junit.After;
import org.junit.Before;
-import org.junit.Ignore;
import org.junit.Test;
import static org.junit.Assert.*;
/**
- * Basic fetcher test 1. generate seedlist 2. inject 3. generate 3. fetch 4.
- * Verify contents
- *
+ * Various fetcher tests which test fetching, refetching, sitemap fetching,
+ * sitemap detection and the basic verification of an agent name check.
*/
public class TestFetcher extends AbstractNutchTest {
+ private static final Logger LOG = LoggerFactory.getLogger(AbstractNutchTest.class);
+
final static Path testdir = new Path("build/test/fetch-test");
Path urlPath;
Server server;
@@ -58,6 +63,7 @@ public class TestFetcher extends AbstractNutchTest {
@Before
public void setUp() throws Exception {
super.setUp();
+ conf.setBoolean(FetcherJob.PARSE_KEY, true);
urlPath = new Path(testdir, "urls");
server = CrawlTestUtil.getServer(conf.getInt("content.server.port", 50000),
"build/test/data/fetch-test-site");
@@ -117,7 +123,6 @@ public class TestFetcher extends AbstractNutchTest {
// fetch
time = System.currentTimeMillis();
- conf.setBoolean(FetcherJob.PARSE_KEY, true);
FetcherJob fetcher = new FetcherJob(conf);
fetcher.fetch(batchId, 1, false, -1);
@@ -154,6 +159,68 @@ public class TestFetcher extends AbstractNutchTest {
}
/**
+ * Tests a refetch of a URL. This process consists of two consecutive
+ * inject, generate, fetch, parse then update cycles. The test configuration
+ * is defined such that <code>db.fetch.interval.default</code> is set to
+ * a very low value (indicating that the URL should be fetched again immediately).
+ * In addition, the test checks that the page's
+ * {@link org.apache.nutch.metadata.Metadata} is preserved across the re-fetch:
+ * the number of metadata entries must be the same after both cycles, i.e. not overwritten.
+ * @see <a href="https://issues.apache.org/jira/browse/NUTCH-2222">NUTCH-2222</a>
+ * @throws Exception
+ */
+ @Test
+ public void testReFetch() throws Exception {
+
+ // generate seedlist
+ ArrayList<String> urls = new ArrayList<String>();
+ // inject
+ addUrl(urls, "index.html");
+ CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+
+ InjectorJob injector = new InjectorJob(conf);
+ injector.inject(urlPath);
+
+ // crawl 1
+ long time = System.currentTimeMillis();
+ GeneratorJob g = new GeneratorJob(conf);
+ String batchId = g.generate(Long.MAX_VALUE, time, false, false, false);
+ FetcherJob fetcher = new FetcherJob(conf);
+ fetcher.fetch(Nutch.ALL_BATCH_ID_STR, 1, false, -1);
+ ParserJob parser = new ParserJob(conf);
+ parser.parse(Nutch.ALL_BATCH_ID_STR, true, true);
+ URLWebPage up = CrawlTestUtil.readContents(webPageStore, Mark.FETCH_MARK, (String[]) null).get(0);
+ assertEquals(urls.size(), 1);
+ int countMetaDatasFetch1 = up.getDatum().getMetadata().size();
+ DbUpdaterJob updateter = new DbUpdaterJob(conf);
+ updateter.run(new String[]{Nutch.ALL_BATCH_ID_STR});
+
+
+ Thread.sleep(10000);
+
+ // crawl 2
+ CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+ injector = new InjectorJob(conf);
+ injector.inject(urlPath);
+ g = new GeneratorJob(conf);
+ time = System.currentTimeMillis();
+ batchId = g.generate(Long.MAX_VALUE, time, false, false, false);
+ fetcher = new FetcherJob(conf);
+ fetcher.fetch(Nutch.ALL_BATCH_ID_STR, 1, false, -1);
+ parser = new ParserJob(conf);
+ parser.parse(Nutch.ALL_BATCH_ID_STR, true, true);
+ updateter = new DbUpdaterJob(conf);
+ updateter.run(new String[]{Nutch.ALL_BATCH_ID_STR});
+ up = CrawlTestUtil.readContents(webPageStore, null, (String[]) null).get(0);
+ assertEquals(urls.size(), 1);
+ int countMetaDatasFetch2 = up.getDatum().getMetadata().size();
+
+ LOG.info("countMetaDatas Fetch1 : {}", countMetaDatasFetch1);
+ LOG.info("countMetaDatas Fetch2 : {}", countMetaDatasFetch2);
+ assertEquals(countMetaDatasFetch1, countMetaDatasFetch2);
+ }
+
+ /**
* Test that only sitemap page fetcher
*
* @throws Exception
@@ -201,7 +268,6 @@ public class TestFetcher extends AbstractNutchTest {
// generate for only sitemap
g.generate(Long.MAX_VALUE, time, false, false, true);
- conf.setBoolean(FetcherJob.PARSE_KEY, true);
FetcherJob fetcher = new FetcherJob(conf);
// for only sitemap fetch
@@ -265,7 +331,6 @@ public class TestFetcher extends AbstractNutchTest {
g.generate(Long.MAX_VALUE, time, false, false, false);
- conf.setBoolean(FetcherJob.PARSE_KEY, true);
FetcherJob fetcher = new FetcherJob(conf);
// for only sitemap fetch
@@ -287,6 +352,10 @@ public class TestFetcher extends AbstractNutchTest {
}
}
+ /**
+ * Maps a webpage to the local Jetty server address so that it can
+ * be fetched as part of an arraylist
+ */
private void addUrl(ArrayList<String> urls, String page) {
urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/"
+ page);
@@ -299,7 +368,6 @@ public class TestFetcher extends AbstractNutchTest {
conf.set("http.agent.name", "");
try {
- conf.setBoolean(FetcherJob.PARSE_KEY, true);
FetcherJob fetcher = new FetcherJob(conf);
fetcher.checkConfiguration();
} catch (IllegalArgumentException iae) {
[2/2] nutch git commit: Merge branch 'NUTCH-2222' into 2.x this
closes #99
Posted by le...@apache.org.
Merge branch 'NUTCH-2222' into 2.x this closes #99
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d868f06c
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d868f06c
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d868f06c
Branch: refs/heads/2.x
Commit: d868f06cfbb2c3e5a9d3496687d87fe76d667e4c
Parents: 1e65c3f fd47844
Author: Lewis John McGibbney <le...@jpl.nasa.gov>
Authored: Thu Apr 7 08:20:05 2016 -0700
Committer: Lewis John McGibbney <le...@jpl.nasa.gov>
Committed: Thu Apr 7 08:20:05 2016 -0700
----------------------------------------------------------------------
.../org/apache/nutch/crawl/GeneratorJob.java | 14 +++-
src/test/nutch-site.xml | 7 ++
.../org/apache/nutch/fetcher/TestFetcher.java | 84 ++++++++++++++++++--
3 files changed, 94 insertions(+), 11 deletions(-)
----------------------------------------------------------------------