Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:45 UTC
[29/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
new file mode 100644
index 0000000..b631319
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -0,0 +1,569 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.StringUtils;
+
+import org.apache.nutch.crawl.CrawlDatum;
+
+import static org.apache.nutch.crawl.CrawlDatum.*;
+
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+
+import static org.junit.Assert.*;
+
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.Test;
+
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}):
+ * <ul>
+ * <li>simulate updatedb with the old CrawlDatum (db status) and the new one
+ * (fetch status) and test whether the resulting CrawlDatum has the appropriate
+ * status.</li>
+ * <li>also check for further CrawlDatum fields (signature, etc.)</li>
+ * <li>and additional conditions:
+ * <ul>
+ * <li>retry counters</li>
+ * <li>signatures</li>
+ * <li>configuration properties</li>
+ * <li>(additional) CrawlDatums of status linked (stemming from inlinks)</li>
+ * </ul>
+ * </li>
+ * </ul>
+ */
+@Category({IntegrationTest.class})
+public class TestCrawlDbStates {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(TestCrawlDbStates.class);
+
+ protected static final byte[][] fetchDbStatusPairs = {
+ { -1, STATUS_DB_UNFETCHED }, { STATUS_FETCH_SUCCESS, STATUS_DB_FETCHED },
+ { STATUS_FETCH_GONE, STATUS_DB_GONE },
+ { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
+ { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM },
+ { STATUS_FETCH_NOTMODIFIED, STATUS_DB_NOTMODIFIED },
+      { STATUS_FETCH_RETRY, -1 }, // fetch_retry has no CrawlDb counterpart
+      { -1, STATUS_DB_DUPLICATE }, };
+
+ /** tested {@link FetchSchedule} implementations */
+ protected String[] schedules = { "DefaultFetchSchedule",
+ "AdaptiveFetchSchedule" };
+
+ /** CrawlDatum as result of a link */
+ protected final CrawlDatum linked = new CrawlDatum(STATUS_LINKED,
+ CrawlDBTestUtil.createConfiguration().getInt("db.fetch.interval.default",
+ 2592000), 0.1f);
+
+ /**
+ * Test the matrix of state transitions:
+ * <ul>
+ * <li>for all available {@link FetchSchedule} implementations</li>
+ * <li>for every possible status in CrawlDb (including "not in CrawlDb")</li>
+ * <li>for every possible fetch status</li>
+ * <li>and zero or more (0-2) additional in-links</li>
+ * </ul>
+ * call {@literal updatedb} and check whether the resulting CrawlDb status is
+ * the expected one.
+ */
+ @Test
+ public void testCrawlDbStateTransitionMatrix() {
+ LOG.info("Test CrawlDatum state transitions");
+ Configuration conf = CrawlDBTestUtil.createConfiguration();
+ CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>(
+ new CrawlDbReducer(), conf);
+ int retryMax = conf.getInt("db.fetch.retry.max", 3);
+ for (String sched : schedules) {
+ LOG.info("Testing state transitions with " + sched);
+ conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+ FetchSchedule schedule = FetchScheduleFactory
+ .getFetchSchedule(new JobConf(conf));
+ for (int i = 0; i < fetchDbStatusPairs.length; i++) {
+ byte fromDbStatus = fetchDbStatusPairs[i][1];
+ for (int j = 0; j < fetchDbStatusPairs.length; j++) {
+ byte fetchStatus = fetchDbStatusPairs[j][0];
+ CrawlDatum fromDb = null;
+ if (fromDbStatus == -1) {
+ // nothing yet in CrawlDb
+ // CrawlDatum added by FreeGenerator or via outlink
+ } else {
+ fromDb = new CrawlDatum();
+ fromDb.setStatus(fromDbStatus);
+ // initialize fetchInterval:
+ schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
+ }
+ // expected db status
+ byte toDbStatus = fetchDbStatusPairs[j][1];
+ if (fetchStatus == -1) {
+ if (fromDbStatus == -1) {
+ // nothing fetched yet: new document detected via outlink
+ toDbStatus = STATUS_DB_UNFETCHED;
+ } else {
+ // nothing fetched but new inlinks detected: status is unchanged
+ toDbStatus = fromDbStatus;
+ }
+ } else if (fetchStatus == STATUS_FETCH_RETRY) {
+ // a simple test of fetch_retry (without retries)
+ if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) {
+ toDbStatus = STATUS_DB_UNFETCHED;
+ } else {
+ toDbStatus = STATUS_DB_GONE;
+ }
+ }
+ String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>"
+ : getStatusName(fromDbStatus));
+ String fetchStatusName = (fetchStatus == -1 ? "<only inlinks>"
+ : CrawlDatum.getStatusName(fetchStatus));
+ LOG.info(fromDbStatusName + " + " + fetchStatusName + " => "
+ + getStatusName(toDbStatus));
+ List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+ for (int l = 0; l <= 2; l++) { // number of additional in-links
+ CrawlDatum fetch = null;
+ if (fetchStatus == -1) {
+ // nothing fetched, need at least one in-link
+ if (l == 0)
+ continue;
+ } else {
+ fetch = new CrawlDatum();
+ if (fromDb != null) {
+ fetch.set(fromDb);
+ } else {
+ // not yet in CrawlDb: added by FreeGenerator
+ schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch);
+ }
+ fetch.setStatus(fetchStatus);
+ fetch.setFetchTime(System.currentTimeMillis());
+ }
+ if (fromDb != null)
+ values.add(fromDb);
+ if (fetch != null)
+ values.add(fetch);
+ for (int n = 0; n < l; n++) {
+ values.add(linked);
+ }
+ List<CrawlDatum> res = updateDb.update(values);
+ if (res.size() != 1) {
+ fail("CrawlDb update didn't result in one single CrawlDatum per URL");
+ continue;
+ }
+ byte status = res.get(0).getStatus();
+ if (status != toDbStatus) {
+ fail("CrawlDb update for " + fromDbStatusName + " and "
+ + fetchStatusName + " and " + l + " inlinks results in "
+ + getStatusName(status) + " (expected: "
+ + getStatusName(toDbStatus) + ")");
+ }
+ values.clear();
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Test states after inject: inject must not modify the status of CrawlDatums
+ * already in CrawlDb. Newly injected elements have status "db_unfetched".
+ * Inject is simulated by calling {@link Injector.InjectReducer#reduce()}.
+ */
+ @Test
+  public void testCrawlDbStateTransitionInject() {
+ LOG.info("Test CrawlDatum states in Injector after inject");
+ Configuration conf = CrawlDBTestUtil.createConfiguration();
+ Injector.InjectReducer injector = new Injector.InjectReducer();
+ CrawlDbUpdateTestDriver<Injector.InjectReducer> injectDriver =
+ new CrawlDbUpdateTestDriver<Injector.InjectReducer>(injector, conf);
+ ScoringFilters scfilters = new ScoringFilters(conf);
+ for (String sched : schedules) {
+ LOG.info("Testing inject with " + sched);
+ conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+ FetchSchedule schedule = FetchScheduleFactory
+ .getFetchSchedule(new JobConf(conf));
+ List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+ for (int i = 0; i < fetchDbStatusPairs.length; i++) {
+ byte fromDbStatus = fetchDbStatusPairs[i][1];
+ byte toDbStatus = fromDbStatus;
+ if (fromDbStatus == -1) {
+ toDbStatus = STATUS_DB_UNFETCHED;
+ } else {
+ CrawlDatum fromDb = new CrawlDatum();
+ fromDb.setStatus(fromDbStatus);
+ schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
+ values.add(fromDb);
+ }
+ LOG.info("inject "
+ + (fromDbStatus == -1 ? "<not in CrawlDb>" : CrawlDatum
+ .getStatusName(fromDbStatus)) + " + "
+ + getStatusName(STATUS_INJECTED) + " => "
+ + getStatusName(toDbStatus));
+ CrawlDatum injected = new CrawlDatum(STATUS_INJECTED, conf.getInt(
+ "db.fetch.interval.default", 2592000), 0.1f);
+ schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, injected);
+ try {
+ scfilters.injectedScore(CrawlDbUpdateUtil.dummyURL, injected);
+ } catch (ScoringFilterException e) {
+ LOG.error(StringUtils.stringifyException(e));
+ }
+ values.add(injected);
+ List<CrawlDatum> res = injectDriver.update(values);
+ if (res.size() != 1) {
+ fail("Inject didn't result in one single CrawlDatum per URL");
+ continue;
+ }
+ byte status = res.get(0).getStatus();
+ if (status != toDbStatus) {
+ fail("Inject for "
+ + (fromDbStatus == -1 ? "" : getStatusName(fromDbStatus)
+ + " and ") + getStatusName(STATUS_INJECTED) + " results in "
+ + getStatusName(status) + " (expected: "
+ + getStatusName(toDbStatus) + ")");
+ }
+ values.clear();
+ }
+ }
+ }
+
+ /**
+ * Test status db_notmodified detected by
+ * <ul>
+ * <li>signature comparison</li>
+ * <li>or HTTP 304</li>
+ * </ul>
+ * In addition, test for all available {@link FetchSchedule} implementations
+ * whether
+ * <ul>
+ * <li>modified time is set</li>
+ * <li>re-fetch is triggered after a certain time to force the fetched content
+ * to be in a recent segment (old segments are deleted, see comments in
+ * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)})</li>
+ * </ul>
+ */
+ @Test
+ public void testCrawlDbReducerNotModified() {
+ LOG.info("Test state notmodified");
+ Configuration conf = CrawlDBTestUtil.createConfiguration();
+ // test not modified detected by signature comparison
+ for (String sched : schedules) {
+ String desc = "test notmodified by signature comparison + " + sched;
+ LOG.info(desc);
+ conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+ ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchNotModified(conf);
+ if (!crawlUtil.run(20)) {
+ fail("failed: " + desc);
+ }
+ }
+ // test not modified detected by HTTP 304
+ for (String sched : schedules) {
+ String desc = "test notmodified by HTTP 304 + " + sched;
+ LOG.info(desc);
+ conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+ ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchNotModifiedHttp304(
+ conf);
+ if (!crawlUtil.run(20)) {
+ fail("failed: " + desc);
+ }
+ }
+ }
+
+ protected class CrawlTestFetchNotModified extends ContinuousCrawlTestUtil {
+
+ /** time of the current fetch */
+ protected long currFetchTime;
+ /** time the last fetch took place */
+ protected long lastFetchTime;
+ /**
+     * time the document was first fetched (ever, or after its last change)
+ */
+ protected long firstFetchTime;
+ /** state in CrawlDb before the last fetch */
+ protected byte previousDbState;
+ /** signature in CrawlDb of previous fetch */
+ protected byte[] lastSignature;
+
+ private long maxFetchInterval;
+ private FetchSchedule schedule;
+
+ CrawlTestFetchNotModified(Configuration conf) {
+ super(conf);
+      maxFetchInterval = conf.getLong("db.fetch.interval.max", 7776000); // default = 90 days
+      maxFetchInterval += (24 * 60 * 60); // add one day to avoid false alarms
+      maxFetchInterval *= 1000; // convert to milliseconds
+ schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf));
+ }
+
+ @Override
+ protected boolean check(CrawlDatum result) {
+ if (lastFetchTime > 0
+ && (currFetchTime - lastFetchTime) > maxFetchInterval) {
+ LOG.error("last effective fetch (HTTP 200, not HTTP 304), at "
+ + new Date(lastFetchTime)
+            + ", took place more than db.fetch.interval.max ago, "
+ + "segment containing fetched content may have been deleted");
+ return false;
+ }
+ switch (result.getStatus()) {
+ case STATUS_DB_NOTMODIFIED:
+ // db_notmodified is correct if the document has been fetched previously
+ // and it has not been changed since
+ if ((previousDbState == STATUS_DB_FETCHED || previousDbState == STATUS_DB_NOTMODIFIED)) {
+ if (lastSignature != null
+ && result.getSignature() != null
+ && SignatureComparator._compare(lastSignature,
+ result.getSignature()) != 0) {
+ LOG.error("document has changed (signature changed) but state is still "
+ + getStatusName(STATUS_DB_NOTMODIFIED));
+ return false;
+ }
+ LOG.info("ok: " + result);
+ return checkModifiedTime(result, firstFetchTime);
+ }
+ LOG.warn("notmodified without previous fetch");
+ break;
+ case STATUS_DB_FETCHED:
+ if (previousDbState == STATUS_DB_UNFETCHED) {
+ LOG.info("ok (first fetch): " + result);
+ return checkModifiedTime(result, firstFetchTime);
+ } else if (lastSignature != null
+ && result.getSignature() != null
+ && SignatureComparator._compare(lastSignature,
+ result.getSignature()) != 0) {
+ LOG.info("ok (content changed): " + result);
+ // expect modified time == now
+ return checkModifiedTime(result, currFetchTime);
+ } else {
+ LOG.warn("document has not changed, db_notmodified expected");
+ }
+ break;
+ case STATUS_DB_UNFETCHED:
+ /**
+ * Status db_unfetched is possible with {@link AdaptiveFetchSchedule}
+ * because {@link CrawlDbReducer#reduce} calls
+ * {@link FetchSchedule#forceRefetch} to force a re-fetch if fetch
+ * interval grows too large.
+ */
+ if (schedule.getClass() == AdaptiveFetchSchedule.class) {
+ LOG.info("state set to unfetched by AdaptiveFetchSchedule");
+ if (result.getSignature() != null) {
+ LOG.warn("must reset signature: " + result);
+ return false;
+ }
+ LOG.info("ok: " + result);
+ firstFetchTime = 0;
+ return true;
+ }
+ }
+ LOG.warn("wrong result: " + result);
+ return false;
+ }
+
+ // test modified time
+ private boolean checkModifiedTime(CrawlDatum result, long modifiedTime) {
+ if (result.getModifiedTime() == 0) {
+ LOG.error("modified time not set (TODO: not set by DefaultFetchSchedule)");
+ // TODO: return false (but DefaultFetchSchedule does not set modified
+ // time, see NUTCH-933)
+ return true;
+ } else if (modifiedTime == result.getModifiedTime()) {
+ return true;
+ }
+ LOG.error("wrong modified time: " + new Date(result.getModifiedTime())
+ + " (expected " + new Date(modifiedTime) + ")");
+ return false;
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ lastFetchTime = currFetchTime;
+ currFetchTime = currentTime;
+ previousDbState = datum.getStatus();
+ lastSignature = datum.getSignature();
+ datum = super.fetch(datum, currentTime);
+ if (firstFetchTime == 0) {
+ firstFetchTime = currFetchTime;
+ } else if ((currFetchTime - firstFetchTime) > (duration / 2)) {
+ // simulate a modification after "one year"
+ changeContent();
+ firstFetchTime = currFetchTime;
+ }
+ return datum;
+ }
+ }
+
+ protected class CrawlTestFetchNotModifiedHttp304 extends
+ CrawlTestFetchNotModified {
+
+ CrawlTestFetchNotModifiedHttp304(Configuration conf) {
+ super(conf);
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ lastFetchTime = currFetchTime;
+ currFetchTime = currentTime;
+ previousDbState = datum.getStatus();
+ lastSignature = datum.getSignature();
+ int httpCode;
+      /*
+       * The document is "really" fetched (no HTTP 304) if the last-modified
+       * time or the signature is unset (the page has not been fetched before,
+       * or the fetch is forced). For test purposes, we also simulate a
+       * modification after "one year".
+       */
+ if (datum.getModifiedTime() == 0 && datum.getSignature() == null
+ || (currFetchTime - firstFetchTime) > (duration / 2)) {
+ firstFetchTime = currFetchTime;
+ httpCode = 200;
+ datum.setStatus(STATUS_FETCH_SUCCESS);
+ // modify content to change signature
+ changeContent();
+ } else {
+ httpCode = 304;
+ datum.setStatus(STATUS_FETCH_NOTMODIFIED);
+ }
+ LOG.info("fetched with HTTP " + httpCode + " => "
+ + getStatusName(datum.getStatus()));
+ datum.setFetchTime(currentTime);
+ return datum;
+ }
+ }
+
+ /**
+ * NUTCH-1245: a fetch_gone should always result in a db_gone.
+ * <p>
+ * Even in a long-running continuous crawl, when a gone page is re-fetched
+ * several times over time.
+ * </p>
+ */
+ @Test
+ public void testCrawlDbReducerPageGoneSchedule1() {
+ LOG.info("NUTCH-1245: test long running continuous crawl");
+ ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(
+ STATUS_FETCH_GONE, STATUS_DB_GONE);
+ if (!crawlUtil.run(20)) {
+ fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+ }
+ }
+
+ /**
+ * NUTCH-1245: a fetch_gone should always result in a db_gone.
+ * <p>
+ * Simulate a misconfiguration where db.fetch.interval.default is set to a
+ * value greater than (db.fetch.interval.max * 1.5).
+ * </p>
+ */
+ @Test
+ public void testCrawlDbReducerPageGoneSchedule2() {
+ LOG.info("NUTCH-1245 (misconfiguration): test with db.fetch.interval.default > (1.5 * db.fetch.interval.max)");
+ Configuration conf = CrawlDBTestUtil.createConfiguration();
+ int fetchIntervalMax = conf.getInt("db.fetch.interval.max", 0);
+ conf.setInt("db.fetch.interval.default", 3 + (int) (fetchIntervalMax * 1.5));
+ ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(conf,
+ STATUS_FETCH_GONE, STATUS_DB_GONE);
+ if (!crawlUtil.run(0)) {
+ fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+ }
+ }
+
+ /**
+ * Test whether signatures are reset for "content-less" states (gone,
+ * redirect, etc.): otherwise, if this state is temporary and the document
+ * appears again with the old content, it may get marked as not_modified in
+ * CrawlDb just after the redirect state. In this case we cannot expect
+ * content in segments. Cf. NUTCH-1422: reset signature for redirects.
+ */
+ // TODO: can only test if solution is done in CrawlDbReducer
+ @Test
+ public void testSignatureReset() {
+ LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
+ Configuration conf = CrawlDBTestUtil.createConfiguration();
+ for (String sched : schedules) {
+ LOG.info("Testing reset signature with " + sched);
+ conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+ ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(conf);
+ if (!crawlUtil.run(20)) {
+ fail("failed: signature not reset");
+ }
+ }
+ }
+
+ private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil {
+
+ byte[][] noContentStates = { { STATUS_FETCH_GONE, STATUS_DB_GONE },
+ { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
+ { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM } };
+
+ int counter = 0;
+ byte fetchState;
+
+ public CrawlTestSignatureReset(Configuration conf) {
+ super(conf);
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ datum = super.fetch(datum, currentTime);
+ counter++;
+      // flip-flop between a successful fetch and one of the content-less states
+ if (counter % 2 == 1) {
+ fetchState = STATUS_FETCH_SUCCESS;
+ } else {
+ fetchState = noContentStates[(counter % 6) / 2][0];
+ }
+ LOG.info("Step " + counter + ": fetched with "
+ + getStatusName(fetchState));
+ datum.setStatus(fetchState);
+ return datum;
+ }
+
+ @Override
+ protected boolean check(CrawlDatum result) {
+ if (result.getStatus() == STATUS_DB_NOTMODIFIED
+ && !(fetchState == STATUS_FETCH_SUCCESS || fetchState == STATUS_FETCH_NOTMODIFIED)) {
+ LOG.error("Should never get into state "
+ + getStatusName(STATUS_DB_NOTMODIFIED) + " from "
+ + getStatusName(fetchState));
+ return false;
+ }
+ if (result.getSignature() != null
+ && !(result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED)) {
+ LOG.error("Signature not reset in state "
+ + getStatusName(result.getStatus()));
+        // do not fail here: the stale signature is not the problem itself,
+        // only the cause of the wrongly assigned db_notmodified checked above
+ }
+ return true;
+ }
+
+ }
+
+}
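
To make the transition matrix above concrete: a single updatedb step reduces the
old db datum plus the new fetch datum (plus any linked datums) to exactly one
resulting datum. Below is a minimal sketch of one such transition, reusing the
CrawlDbUpdateUtil helper used by the test above; the class is illustrative only
and not part of this commit.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.*;

public class UpdateDbTransitionSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = CrawlDBTestUtil.createConfiguration();
    CrawlDbUpdateUtil<CrawlDbReducer> updateDb =
        new CrawlDbUpdateUtil<CrawlDbReducer>(new CrawlDbReducer(), conf);
    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
    // old state: unfetched, with the default 30-day fetch interval
    values.add(new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 2592000));
    // new state: a successful fetch
    CrawlDatum fetch = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 2592000);
    fetch.setFetchTime(System.currentTimeMillis());
    values.add(fetch);
    List<CrawlDatum> res = updateDb.update(values);
    // per the matrix: db_unfetched + fetch_success => db_fetched
    System.out.println(CrawlDatum.getStatusName(res.get(0).getStatus()));
  }
}
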
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
new file mode 100644
index 0000000..0ce3c5f
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
@@ -0,0 +1,373 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Basic generator test:
+ * <ol>
+ * <li>insert entries into crawldb</li>
+ * <li>generate entries to fetch</li>
+ * <li>verify that the number of generated URLs matches</li>
+ * <li>verify that the highest-scoring URLs are generated</li>
+ * </ol>
+ *
+ */
+@Category({IntegrationTest.class})
+public class TestGenerator {
+
+ Configuration conf;
+
+ Path dbDir;
+
+ Path segmentsDir;
+
+ FileSystem fs;
+
+ final static Path testdir = new Path("build/test/generator-test");
+
+ @Before
+ public void setUp() throws Exception {
+ conf = CrawlDBTestUtil.createConfiguration();
+ fs = FileSystem.get(conf);
+ fs.delete(testdir, true);
+ }
+
+ @After
+ public void tearDown() {
+ delete(testdir);
+ }
+
+ private void delete(Path p) {
+ try {
+ fs.delete(p, true);
+ } catch (IOException e) {
+ }
+ }
+
+ /**
+   * Test that the generator generates a fetchlist ordered by score (descending).
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testGenerateHighest() throws Exception {
+
+ final int NUM_RESULTS = 2;
+
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+ for (int i = 0; i <= 100; i++) {
+ list.add(createURLCrawlDatum("http://aaa/" + pad(i), 1, i));
+ }
+
+ createCrawlDB(list);
+
+ Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false);
+
+ Path fetchlist = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
+ // sort urls by score desc
+ Collections.sort(l, new ScoreComparator());
+
+ // verify we got right amount of records
+ Assert.assertEquals(NUM_RESULTS, l.size());
+
+ // verify we have the highest scoring urls
+ Assert.assertEquals("http://aaa/100", (l.get(0).url.toString()));
+ Assert.assertEquals("http://aaa/099", (l.get(1).url.toString()));
+ }
+
+ private String pad(int i) {
+ String s = Integer.toString(i);
+ while (s.length() < 3) {
+ s = "0" + s;
+ }
+ return s;
+ }
+
+ /**
+ * Comparator that sorts by score desc.
+ */
+ public class ScoreComparator implements Comparator<URLCrawlDatum> {
+
+ public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
+ if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) {
+ return -1;
+ }
+ if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) {
+ return 1;
+ }
+ return 0;
+ }
+ }
+
+ /**
+   * Test that the generator obeys the per-host limit configured via
+   * {@link Generator#GENERATOR_MAX_COUNT}.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testGenerateHostLimit() throws Exception {
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+ list.add(createURLCrawlDatum("http://www.example.com/index1.html", 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.com/index2.html", 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.com/index3.html", 1, 1));
+
+ createCrawlDB(list);
+
+ Configuration myConfiguration = new Configuration(conf);
+ myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2);
+ Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+ myConfiguration, false);
+
+ Path fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ Assert.assertEquals(1, fetchList.size());
+
+ myConfiguration = new Configuration(conf);
+ myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+ false);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ Assert.assertEquals(2, fetchList.size());
+
+ myConfiguration = new Configuration(conf);
+ myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+ false);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ Assert.assertEquals(3, fetchList.size());
+ }
+
+ /**
+   * Test that the generator obeys {@link Generator#GENERATOR_MAX_COUNT} with
+   * the count mode ({@link Generator#GENERATOR_COUNT_MODE}) set to domain.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testGenerateDomainLimit() throws Exception {
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+ list.add(createURLCrawlDatum("http://a.example.com/index.html", 1, 1));
+ list.add(createURLCrawlDatum("http://b.example.com/index.html", 1, 1));
+ list.add(createURLCrawlDatum("http://c.example.com/index.html", 1, 1));
+
+ createCrawlDB(list);
+
+ Configuration myConfiguration = new Configuration(conf);
+ myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2);
+ myConfiguration.set(Generator.GENERATOR_COUNT_MODE,
+ Generator.GENERATOR_COUNT_VALUE_DOMAIN);
+
+ Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+ myConfiguration, false);
+
+ Path fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ Assert.assertEquals(1, fetchList.size());
+
+ myConfiguration = new Configuration(myConfiguration);
+ myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+ false);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ Assert.assertEquals(2, fetchList.size());
+
+ myConfiguration = new Configuration(myConfiguration);
+ myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4);
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+ false);
+
+ fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ fetchList = readContents(fetchlistPath);
+
+ // verify we got right amount of records
+ Assert.assertEquals(3, fetchList.size());
+ }
+
+ /**
+ * Test generator obeys the filter setting.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testFilter() throws IOException, Exception {
+
+ ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+ list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
+ list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
+
+ createCrawlDB(list);
+
+ Configuration myConfiguration = new Configuration(conf);
+ myConfiguration.set("urlfilter.suffix.file", "filter-all.txt");
+
+ Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+ myConfiguration, true);
+
+ Assert.assertNull("should be null (0 entries)", generatedSegment);
+
+ generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+ false);
+
+ Path fetchlistPath = new Path(new Path(generatedSegment,
+ CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+ ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+ // verify nothing got filtered
+ Assert.assertEquals(list.size(), fetchList.size());
+
+ }
+
+ /**
+ * Read contents of fetchlist.
+ *
+ * @param fetchlist
+   *          path to the generated fetchlist
+ * @return Generated {@link URLCrawlDatum} objects
+ * @throws IOException
+ */
+ private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
+ throws IOException {
+ // verify results
+ Option rFile = SequenceFile.Reader.file(fetchlist);
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+
+ ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+ READ: do {
+ Text key = new Text();
+ CrawlDatum value = new CrawlDatum();
+ if (!reader.next(key, value)) {
+ break READ;
+ }
+ l.add(new URLCrawlDatum(key, value));
+ } while (true);
+
+ reader.close();
+ return l;
+ }
+
+ /**
+ * Generate Fetchlist.
+ *
+ * @param numResults
+ * number of results to generate
+ * @param config
+ * Configuration to use
+ * @return path to generated segment
+ * @throws IOException
+ */
+ private Path generateFetchlist(int numResults, Configuration config,
+ boolean filter) throws IOException {
+ // generate segment
+ Generator g = new Generator(config);
+ Path[] generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
+ Long.MAX_VALUE, filter, false);
+ if (generatedSegment == null)
+ return null;
+ return generatedSegment[0];
+ }
+
+ /**
+ * Creates CrawlDB.
+ *
+ * @param list
+ * database contents
+ * @throws IOException
+ * @throws Exception
+ */
+ private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws IOException,
+ Exception {
+ dbDir = new Path(testdir, "crawldb");
+ segmentsDir = new Path(testdir, "segments");
+ fs.mkdirs(dbDir);
+ fs.mkdirs(segmentsDir);
+
+ // create crawldb
+ CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
+ }
+
+ /**
+ * Constructs new {@link URLCrawlDatum} from submitted parameters.
+ *
+ * @param url
+ * url to use
+ * @param fetchInterval
+ * {@link CrawlDatum#setFetchInterval(float)}
+ * @param score
+ * {@link CrawlDatum#setScore(float)}
+ * @return Constructed object
+ */
+ private URLCrawlDatum createURLCrawlDatum(final String url,
+ final int fetchInterval, final float score) {
+ return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
+ CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
+ }
+}
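
The three generator tests above all drive the same knob: Generator.GENERATOR_MAX_COUNT
caps the number of generated URLs per host (the default count mode) or per
domain, depending on Generator.GENERATOR_COUNT_MODE. A hedged sketch of the
domain-mode configuration, using the same calls as the tests (baseConf, dbDir
and segmentsDir stand in for an existing setup):

Configuration conf = new Configuration(baseConf);
conf.setInt(Generator.GENERATOR_MAX_COUNT, 2);
conf.set(Generator.GENERATOR_COUNT_MODE, Generator.GENERATOR_COUNT_VALUE_DOMAIN);
Generator g = new Generator(conf);
// numLists = -1, unlimited topN, no filtering; flags as in generateFetchlist above
Path[] segments = g.generate(dbDir, segmentsDir, -1, Integer.MAX_VALUE,
    Long.MAX_VALUE, false, false);
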
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
new file mode 100644
index 0000000..59a3e8c
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Basic injector test:
+ * <ol>
+ * <li>create a text file with URLs</li>
+ * <li>inject them into crawldb</li>
+ * <li>read crawldb entries and verify contents</li>
+ * <li>inject more URLs into crawldb</li>
+ * <li>read crawldb entries and verify contents</li>
+ * </ol>
+ *
+ */
+@Category({IntegrationTest.class})
+public class TestInjector {
+
+ private Configuration conf;
+ private FileSystem fs;
+ final static Path testdir = new Path("build/test/inject-test");
+ Path crawldbPath;
+ Path urlPath;
+
+ @Before
+ public void setUp() throws Exception {
+ conf = CrawlDBTestUtil.createConfiguration();
+ urlPath = new Path(testdir, "urls");
+ crawldbPath = new Path(testdir, "crawldb");
+ fs = FileSystem.get(conf);
+ if (fs.exists(urlPath))
+ fs.delete(urlPath, false);
+ if (fs.exists(crawldbPath))
+ fs.delete(crawldbPath, true);
+ }
+
+ @After
+ public void tearDown() throws IOException {
+ fs.delete(testdir, true);
+ }
+
+ @Test
+ public void testInject()
+ throws IOException, ClassNotFoundException, InterruptedException {
+ ArrayList<String> urls = new ArrayList<String>();
+    // Use a separate list for the metadata so we can still compare the URL
+    // lists with containsAll
+ ArrayList<String> metadata = new ArrayList<String>();
+ for (int i = 0; i < 100; i++) {
+ urls.add("http://zzz.com/" + i + ".html");
+ metadata.add("\tnutch.score=2." + i
+ + "\tnutch.fetchInterval=171717\tkey=value");
+ }
+ CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata);
+
+ Injector injector = new Injector(conf);
+ injector.inject(crawldbPath, urlPath);
+
+ // verify results
+ List<String> read = readCrawldb();
+
+ Collections.sort(read);
+ Collections.sort(urls);
+
+ Assert.assertEquals(urls.size(), read.size());
+
+ Assert.assertTrue(read.containsAll(urls));
+ Assert.assertTrue(urls.containsAll(read));
+
+ // inject more urls
+ ArrayList<String> urls2 = new ArrayList<String>();
+ for (int i = 0; i < 100; i++) {
+ urls2.add("http://xxx.com/" + i + ".html");
+      // Overwrite previously injected records but preserve their original
+      // metadata
+ urls2.add("http://zzz.com/" + i + ".html");
+ }
+ CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
+ injector = new Injector(conf);
+ conf.setBoolean("db.injector.update", true);
+ injector.inject(crawldbPath, urlPath);
+ urls.addAll(urls2);
+
+ // verify results
+ read = readCrawldb();
+
+ Collections.sort(read);
+ Collections.sort(urls);
+
+    // We should have 100 fewer records because we've overwritten them
+ Assert.assertEquals(urls.size() - 100, read.size());
+
+ Assert.assertTrue(read.containsAll(urls));
+ Assert.assertTrue(urls.containsAll(read));
+
+ // Check if we correctly preserved MD
+ Map<String, CrawlDatum> records = readCrawldbRecords();
+
+    // Iterate over the URLs, looking for those prefixed with http://zzz.com/,
+    // so we can check their metadata, score, and fetch interval
+ Text writableKey = new Text("key");
+ Text writableValue = new Text("value");
+ for (String url : urls) {
+ if (url.indexOf("http://zzz") == 0) {
+ // Check for fetch interval
+ Assert.assertTrue(records.get(url).getFetchInterval() == 171717);
+ // Check for default score
+ Assert.assertTrue(records.get(url).getScore() != 1.0);
+ // Check for MD key=value
+ Assert.assertEquals(writableValue,
+ records.get(url).getMetaData().get(writableKey));
+ }
+ }
+ }
+
+ private List<String> readCrawldb() throws IOException {
+ Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
+ + "/part-r-00000/data");
+ System.out.println("reading:" + dbfile);
+ Option rFile = SequenceFile.Reader.file(dbfile);
+ @SuppressWarnings("resource")
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+ ArrayList<String> read = new ArrayList<String>();
+
+ READ: do {
+ Text key = new Text();
+ CrawlDatum value = new CrawlDatum();
+ if (!reader.next(key, value))
+ break READ;
+ read.add(key.toString());
+ } while (true);
+
+ return read;
+ }
+
+ private HashMap<String, CrawlDatum> readCrawldbRecords() throws IOException {
+ Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
+ + "/part-r-00000/data");
+ System.out.println("reading:" + dbfile);
+ Option rFile = SequenceFile.Reader.file(dbfile);
+ @SuppressWarnings("resource")
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+ HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>();
+
+ READ: do {
+ Text key = new Text();
+ CrawlDatum value = new CrawlDatum();
+ if (!reader.next(key, value))
+ break READ;
+ read.put(key.toString(), value);
+ } while (true);
+
+ return read;
+ }
+}
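
The metadata list built in testInject follows the injector's seed-list format:
each line is a URL followed by tab-separated key=value pairs, where nutch.score
and nutch.fetchInterval override the injected datum's defaults and any other
pair (here key=value) is stored as plain metadata. For i = 5 the generated seed
line reads:

http://zzz.com/5.html	nutch.score=2.5	nutch.fetchInterval=171717	key=value

The assertions later in the test check exactly these three effects: the custom
fetch interval (171717), a non-default score, and the preserved key=value entry.
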
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
new file mode 100644
index 0000000..23aaa88
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestLinkDbMerger {
+ private static final Logger LOG = Logger.getLogger(TestLinkDbMerger.class
+ .getName());
+
+ String url10 = "http://example.com/foo";
+ String[] urls10 = new String[] { "http://example.com/100",
+ "http://example.com/101" };
+
+ String url11 = "http://example.com/";
+ String[] urls11 = new String[] { "http://example.com/110",
+ "http://example.com/111" };
+
+ String url20 = "http://example.com/";
+ String[] urls20 = new String[] { "http://foo.com/200", "http://foo.com/201" };
+ String url21 = "http://example.com/bar";
+ String[] urls21 = new String[] { "http://foo.com/210", "http://foo.com/211" };
+
+ String[] urls10_expected = urls10;
+ String[] urls11_expected = new String[] { urls11[0], urls11[1], urls20[0],
+ urls20[1] };
+ String[] urls20_expected = urls11_expected;
+ String[] urls21_expected = urls21;
+
+ TreeMap<String, String[]> init1 = new TreeMap<String, String[]>();
+ TreeMap<String, String[]> init2 = new TreeMap<String, String[]>();
+ HashMap<String, String[]> expected = new HashMap<String, String[]>();
+ Configuration conf;
+ Path testDir;
+ FileSystem fs;
+ LinkDbReader reader;
+
+ @Before
+ public void setUp() throws Exception {
+ init1.put(url10, urls10);
+ init1.put(url11, urls11);
+ init2.put(url20, urls20);
+ init2.put(url21, urls21);
+ expected.put(url10, urls10_expected);
+ expected.put(url11, urls11_expected);
+ expected.put(url20, urls20_expected);
+ expected.put(url21, urls21_expected);
+ conf = NutchConfiguration.create();
+ fs = FileSystem.get(conf);
+ testDir = new Path("build/test/test-linkdb-"
+ + new java.util.Random().nextInt());
+ fs.mkdirs(testDir);
+ }
+
+ @After
+ public void tearDown() {
+ try {
+ if (fs.exists(testDir))
+ fs.delete(testDir, true);
+ } catch (Exception e) {
+ }
+ try {
+ reader.close();
+ } catch (Exception e) {
+ }
+ }
+
+ @Test
+ public void testMerge() throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ FileSystem fs = FileSystem.get(conf);
+ fs.mkdirs(testDir);
+ Path linkdb1 = new Path(testDir, "linkdb1");
+ Path linkdb2 = new Path(testDir, "linkdb2");
+ Path output = new Path(testDir, "output");
+ createLinkDb(conf, fs, linkdb1, init1);
+ createLinkDb(conf, fs, linkdb2, init2);
+ LinkDbMerger merger = new LinkDbMerger(conf);
+ LOG.fine("* merging linkdbs to " + output);
+ merger.merge(output, new Path[] { linkdb1, linkdb2 }, false, false);
+ LOG.fine("* reading linkdb: " + output);
+ reader = new LinkDbReader(conf, output);
+ Iterator<String> it = expected.keySet().iterator();
+ while (it.hasNext()) {
+ String url = it.next();
+ LOG.fine("url=" + url);
+ String[] vals = expected.get(url);
+ Inlinks inlinks = reader.getInlinks(new Text(url));
+      // must not be null
+ Assert.assertNotNull(inlinks);
+ ArrayList<String> links = new ArrayList<String>();
+ Iterator<?> it2 = inlinks.iterator();
+ while (it2.hasNext()) {
+ Inlink in = (Inlink) it2.next();
+ links.add(in.getFromUrl());
+ }
+ for (int i = 0; i < vals.length; i++) {
+ LOG.fine(" -> " + vals[i]);
+ Assert.assertTrue(links.contains(vals[i]));
+ }
+ }
+ reader.close();
+ fs.delete(testDir, true);
+ }
+
+ private void createLinkDb(Configuration config, FileSystem fs, Path linkdb,
+ TreeMap<String, String[]> init) throws Exception {
+ LOG.fine("* creating linkdb: " + linkdb);
+ Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
+
+ Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(Inlinks.class);
+ MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+ "part-00000"), wKeyOpt, wValueOpt);
+ Iterator<String> it = init.keySet().iterator();
+ while (it.hasNext()) {
+ String key = it.next();
+ Inlinks inlinks = new Inlinks();
+ String[] vals = init.get(key);
+ for (int i = 0; i < vals.length; i++) {
+ Inlink in = new Inlink(vals[i], vals[i]);
+ inlinks.add(in);
+ }
+ writer.append(new Text(key), inlinks);
+ }
+ writer.close();
+ }
+}
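
The expected arrays above encode the merge semantics under test: url11 and url20
are the same URL (http://example.com/), so after the merge its inlinks are the
union of the entries from both input databases, which is exactly urls11_expected.
A minimal sketch of that union using the same Inlinks API as createLinkDb
(illustrative only):

Inlinks merged = new Inlinks();
for (String from : new String[] { "http://example.com/110",
    "http://example.com/111", "http://foo.com/200", "http://foo.com/201" }) {
  merged.add(new Inlink(from, from)); // (fromUrl, anchor), as in createLinkDb
}
// merged now holds the four inlinks expected for http://example.com/
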
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
new file mode 100644
index 0000000..db82d7a
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestSignatureFactory {
+
+ @Test
+ public void testGetSignature() {
+ Configuration conf = NutchConfiguration.create();
+ Signature signature1 = SignatureFactory.getSignature(conf);
+ Signature signature2 = SignatureFactory.getSignature(conf);
+ Assert.assertNotNull(signature1);
+ Assert.assertNotNull(signature2);
+ Assert.assertEquals(signature1, signature2);
+ }
+}
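
The assertEquals in this test passes because SignatureFactory caches and returns
the same Signature instance for a given Configuration (Signature does not
override equals). Which implementation gets instantiated is configurable; a
hedged sketch, assuming the db.signature.class property that SignatureFactory
reads:

Configuration conf = NutchConfiguration.create();
// hypothetical override: use TextProfileSignature instead of the default
conf.set("db.signature.class", "org.apache.nutch.crawl.TextProfileSignature");
Signature signature = SignatureFactory.getSignature(conf);
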
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
new file mode 100644
index 0000000..a23d080
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDBTestUtil;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.mortbay.jetty.Server;
+
+/**
+ * Basic fetcher test:
+ * <ol>
+ * <li>generate seedlist</li>
+ * <li>inject</li>
+ * <li>generate</li>
+ * <li>fetch</li>
+ * <li>verify contents</li>
+ * </ol>
+ *
+ */
+public class TestFetcher {
+
+ final static Path testdir = new Path("build/test/fetch-test");
+ Configuration conf;
+ FileSystem fs;
+ Path crawldbPath;
+ Path segmentsPath;
+ Path urlPath;
+ Server server;
+
+ @Before
+ public void setUp() throws Exception {
+ conf = CrawlDBTestUtil.createConfiguration();
+ fs = FileSystem.get(conf);
+ fs.delete(testdir, true);
+ urlPath = new Path(testdir, "urls");
+ crawldbPath = new Path(testdir, "crawldb");
+ segmentsPath = new Path(testdir, "segments");
+ server = CrawlDBTestUtil.getServer(
+ conf.getInt("content.server.port", 50000),
+ "build/test/data/fetch-test-site");
+ server.start();
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ server.stop();
+ for (int i = 0; i < 5; i++) {
+ if (!server.isStopped()) {
+ Thread.sleep(1000);
+ }
+ }
+ fs.delete(testdir, true);
+ }
+
+ @Test
+ @Category(IntegrationTest.class)
+ public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
+
+ // generate seedlist
+ ArrayList<String> urls = new ArrayList<String>();
+
+ addUrl(urls, "index.html");
+ addUrl(urls, "pagea.html");
+ addUrl(urls, "pageb.html");
+ addUrl(urls, "dup_of_pagea.html");
+ addUrl(urls, "nested_spider_trap.html");
+ addUrl(urls, "exception.html");
+
+ CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+
+ // inject
+ Injector injector = new Injector(conf);
+ injector.inject(crawldbPath, urlPath);
+
+ // generate
+ Generator g = new Generator(conf);
+ Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
+ Long.MAX_VALUE, Long.MAX_VALUE, false, false);
+
+ long time = System.currentTimeMillis();
+ // fetch
+ Fetcher fetcher = new Fetcher(conf);
+
+ // Set fetcher.parse to true
+ conf.setBoolean("fetcher.parse", true);
+
+ fetcher.fetch(generatedSegment[0], 1);
+
+ time = System.currentTimeMillis() - time;
+
+    // verify politeness: time taken should be more than (num_of_pages + 1) * delay
+ int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat(
+ "fetcher.server.delay", 5));
+ Assert.assertTrue(time > minimumTime);
+
+ // verify content
+ Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME),
+ "part-00000/data");
+ @SuppressWarnings("resource")
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
+
+ ArrayList<String> handledurls = new ArrayList<String>();
+
+ READ_CONTENT: do {
+ Text key = new Text();
+ Content value = new Content();
+ if (!reader.next(key, value))
+ break READ_CONTENT;
+ String contentString = new String(value.getContent());
+ if (contentString.indexOf("Nutch fetcher test page") != -1) {
+ handledurls.add(key.toString());
+ }
+ } while (true);
+
+ reader.close();
+
+ Collections.sort(urls);
+ Collections.sort(handledurls);
+
+ // verify that enough pages were handled
+ Assert.assertEquals(urls.size(), handledurls.size());
+
+ // verify that correct pages were handled
+ Assert.assertTrue(handledurls.containsAll(urls));
+ Assert.assertTrue(urls.containsAll(handledurls));
+
+ handledurls.clear();
+
+ // verify parse data
+ Path parseData = new Path(
+ new Path(generatedSegment[0], ParseData.DIR_NAME), "part-00000/data");
+ reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
+
+ READ_PARSE_DATA: do {
+ Text key = new Text();
+ ParseData value = new ParseData();
+ if (!reader.next(key, value))
+ break READ_PARSE_DATA;
+      // make sure they all contain the "nutch.segment.name" and
+      // "nutch.content.digest" keys in the parse metadata
+ Metadata contentMeta = value.getContentMeta();
+ if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
+ && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
+ handledurls.add(key.toString());
+ }
+ } while (true);
+
+ Collections.sort(handledurls);
+
+ Assert.assertEquals(urls.size(), handledurls.size());
+
+ Assert.assertTrue(handledurls.containsAll(urls));
+ Assert.assertTrue(urls.containsAll(handledurls));
+ }
+
+ private void addUrl(ArrayList<String> urls, String page) {
+ urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/"
+ + page);
+ }
+
+ @Test
+ public void testAgentNameCheck() {
+
+ boolean failedNoAgentName = false;
+ conf.set("http.agent.name", "");
+
+ try {
+ conf.setBoolean("fetcher.parse", false);
+ Fetcher fetcher = new Fetcher(conf);
+ fetcher.fetch(null, 1);
+ } catch (IllegalArgumentException iae) {
+ String message = iae.getMessage();
+ failedNoAgentName = message.equals("Fetcher: No agents listed in "
+ + "'http.agent.name' property.");
+ } catch (Exception e) {
+ }
+
+ Assert.assertTrue(failedNoAgentName);
+ }
+
+}
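
The politeness assertion in testFetch is a simple lower bound: a polite fetcher
that waits fetcher.server.delay between requests to the same host cannot finish
faster than roughly one delay per page. A worked instance of the same formula
(the delay value is hypothetical; the test reads it from its configuration):

int numUrls = 6;           // seed URLs added in testFetch
float delaySeconds = 1.0f; // assumed value of "fetcher.server.delay"
int minimumTimeMs = (int) ((numUrls + 1) * 1000 * delaySeconds); // = 7000 ms
// testFetch asserts that the measured fetch time exceeds this bound
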
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
new file mode 100644
index 0000000..3a25f26
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.hadoop.mrunit.ReduceDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.Reducer;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+
+/** Test {@link IndexerMapReduce} */
+public class TestIndexerMapReduce {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(TestIndexerMapReduce.class);
+
+ public static String testUrl = "http://nutch.apache.org/";
+ public static Text testUrlText = new Text(testUrl);
+ public static String htmlContentType = "text/html";
+ public static String testHtmlDoc = "<!DOCTYPE html>\n"
+ + "<html>\n"
+ + "<head>\n"
+ + "<title>Test Indexing Binary Content</title>\n"
+ + "<meta charset=\"utf-8\">\n"
+ + "<meta name=\"keywords\" lang=\"en\" content=\"charset, encoding\" />\n"
+      + "<meta name=\"keywords\" lang=\"fr\" content=\"codage des caractères\" />\n"
+      + "<meta name=\"keywords\" lang=\"cs\" content=\"kódování znak\u016f\" />\n"
+ + "</head>\n"
+ + "<body>\n"
+ + "<p>\n"
+ + "<ul>\n"
+ + " <li lang=\"en\">English: character set, encoding\n"
+      + "  <li lang=\"fr\">Français: codage des caractères\n"
+      + "  <li lang=\"cs\">\u010ce\u0161tina: kódování znak\u016f (not covered by Latin-1)\n"
+ + "</ul>\n"
+ + "</body>\n"
+ + "</html>";
+ public static Metadata htmlMeta = new Metadata();
+ static {
+ htmlMeta.add("Content-Type", "text/html");
+ // add segment and signature to avoid NPEs
+ htmlMeta.add(Nutch.SEGMENT_NAME_KEY, "123");
+ htmlMeta.add(Nutch.SIGNATURE_KEY, "123");
+ }
+ public static ParseText parseText = new ParseText("Test");
+ public static ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+ "Test", new Outlink[] {}, htmlMeta);
+ public static CrawlDatum crawlDatumDbFetched = new CrawlDatum(
+ CrawlDatum.STATUS_DB_FETCHED, 60 * 60 * 24);
+ public static CrawlDatum crawlDatumFetchSuccess = new CrawlDatum(
+ CrawlDatum.STATUS_FETCH_SUCCESS, 60 * 60 * 24);
+
+ private Reducer<Text, NutchWritable, Text, NutchIndexAction> reducer = new IndexerMapReduce();
+ private ReduceDriver<Text, NutchWritable, Text, NutchIndexAction> reduceDriver;
+ private Configuration configuration;
+
+
+ /**
+ * Test indexing of base64-encoded binary content.
+ */
+ @Test
+ @Category(IntegrationTest.class)
+ public void testBinaryContentBase64() {
+ configuration = NutchConfiguration.create();
+ configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);
+
+ Charset[] testCharsets = { StandardCharsets.UTF_8,
+ Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") };
+ for (Charset charset : testCharsets) {
+ LOG.info("Testing indexing binary content as base64 for charset {}",
+ charset.name());
+
+ String htmlDoc = testHtmlDoc;
+ if (charset != StandardCharsets.UTF_8) {
+ htmlDoc = htmlDoc.replaceAll("utf-8", charset.name());
+ if (charset.name().equalsIgnoreCase("iso-8859-1")) {
+ // Western-European character set: remove Czech content
+ htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", "");
+ } else if (charset.name().equalsIgnoreCase("iso-8859-2")) {
+ // Eastern-European character set: remove French content
+ htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", "");
+ }
+ }
+
+ Content content = new Content(testUrl, testUrl,
+ htmlDoc.getBytes(charset), htmlContentType, htmlMeta,
+ configuration);
+
+ NutchDocument doc = runIndexer(crawlDatumDbFetched,
+ crawlDatumFetchSuccess, parseText, parseData, content);
+ assertNotNull("No NutchDocument indexed", doc);
+
+ String binaryContentBase64 = (String) doc.getField("binaryContent")
+ .getValues().get(0);
+ LOG.info("binary content (base64): {}", binaryContentBase64);
+ String binaryContent = new String(
+ Base64.decodeBase64(binaryContentBase64), charset);
+ LOG.info("binary content (decoded): {}", binaryContent);
+ assertEquals(
+ "Binary content (" + charset + ") not correctly saved as base64",
+ htmlDoc, binaryContent);
+ }
+ }
+
+ /**
+ * Run {@link IndexerMapReduce.reduce(...)} to get a "indexed"
+ * {@link NutchDocument} by passing objects from segment and CrawlDb to the
+ * indexer.
+ *
+ * @param dbDatum
+ * crawl datum from CrawlDb
+ * @param fetchDatum
+ * crawl datum (fetch status) from segment
+ * @param parseText
+ * plain text from parsed document
+ * @param parseData
+ * parse data
+ * @param content
+ * (optional) protocol content, required only if binary content is indexed
+ * @return "indexed" document
+ */
+ public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum,
+ ParseText parseText, ParseData parseData, Content content) {
+ List<NutchWritable> values = new ArrayList<NutchWritable>();
+ values.add(new NutchWritable(dbDatum));
+ values.add(new NutchWritable(fetchDatum));
+ values.add(new NutchWritable(parseText));
+ values.add(new NutchWritable(parseData));
+ values.add(new NutchWritable(content));
+ reduceDriver = ReduceDriver.newReduceDriver(reducer);
+ reduceDriver.setConfiguration(configuration);
+ reduceDriver.withInput(testUrlText, values);
+ List<Pair<Text, NutchIndexAction>> reduceResult;
+ NutchDocument doc = null;
+ try {
+ reduceResult = reduceDriver.run();
+ for (Pair<Text, NutchIndexAction> p : reduceResult) {
+ if (p.getSecond().action != NutchIndexAction.DELETE) {
+ doc = p.getSecond().doc;
+ }
+ }
+ } catch (IOException e) {
+ LOG.error(StringUtils.stringifyException(e));
+ }
+ return doc;
+ }
+
+}
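
For context on the check above: a minimal, self-contained sketch of the same base64 round-trip, assuming only commons-codec on the classpath (class name and sample string here are illustrative, not part of the commit):

    import java.nio.charset.StandardCharsets;
    import org.apache.commons.codec.binary.Base64;

    public class Base64RoundTripSketch {
      public static void main(String[] args) {
        String html = "<html><body>kódování znak\u016f</body></html>";
        // Encode the raw document bytes, as the indexer does when
        // IndexerMapReduce.INDEXER_BINARY_AS_BASE64 is enabled.
        String encoded = Base64.encodeBase64String(
            html.getBytes(StandardCharsets.UTF_8));
        // Decoding with the same charset must recover the original markup,
        // which is exactly what testBinaryContentBase64 asserts per charset.
        String decoded = new String(Base64.decodeBase64(encoded),
            StandardCharsets.UTF_8);
        assert html.equals(decoded);
      }
    }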
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
new file mode 100644
index 0000000..14b246b
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category(IntegrationTest.class)
+public class TestIndexingFilters {
+
+ /**
+ * Test behaviour when a defined filter does not exist.
+ *
+ * @throws IndexingException
+ */
+ @Test
+ public void testNonExistingIndexingFilter() throws IndexingException {
+ Configuration conf = NutchConfiguration.create();
+ conf.addResource("nutch-default.xml");
+ conf.addResource("crawl-tests.xml");
+
+ String class1 = "NonExistingFilter";
+ String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
+ conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
+
+ IndexingFilters filters = new IndexingFilters(conf);
+ filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
+ new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
+ "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+ }
+
+ /**
+ * Test behaviour when the NutchDocument is null.
+ */
+ @Test
+ public void testNutchDocumentNullIndexingFilter() throws IndexingException {
+ Configuration conf = NutchConfiguration.create();
+ conf.addResource("nutch-default.xml");
+ conf.addResource("crawl-tests.xml");
+
+ IndexingFilters filters = new IndexingFilters(conf);
+ NutchDocument doc = filters.filter(null, new ParseImpl("text",
+ new ParseData(new ParseStatus(), "title", new Outlink[0],
+ new Metadata())), new Text("http://www.example.com/"),
+ new CrawlDatum(), new Inlinks());
+
+ Assert.assertNull(doc);
+ }
+
+ /**
+ * Test that resetting the indexing filter order does not take effect, as
+ * instantiated filters are cached.
+ *
+ * @throws IndexingException
+ */
+ @Test
+ public void testFilterCacheIndexingFilter() throws IndexingException {
+ Configuration conf = NutchConfiguration.create();
+ conf.addResource("nutch-default.xml");
+ conf.addResource("crawl-tests.xml");
+
+ String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
+ conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);
+
+ IndexingFilters filters1 = new IndexingFilters(conf);
+ NutchDocument fdoc1 = filters1.filter(new NutchDocument(), new ParseImpl(
+ "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
+ new Metadata())), new Text("http://www.example.com/"),
+ new CrawlDatum(), new Inlinks());
+
+ // add another index filter
+ String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
+ // set content metadata
+ Metadata md = new Metadata();
+ md.add("example", "data");
+ // set content metadata property defined in MetadataIndexer
+ conf.set("index.content.md", "example");
+ // add MetadataIndexer filter
+ conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
+ IndexingFilters filters2 = new IndexingFilters(conf);
+ NutchDocument fdoc2 = filters2.filter(new NutchDocument(), new ParseImpl(
+ "text", new ParseData(new ParseStatus(), "title", new Outlink[0], md)),
+ new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+ Assert.assertEquals(fdoc1.getFieldNames().size(), fdoc2.getFieldNames()
+ .size());
+ }
+
+}
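
A minimal sketch of wiring the filter chain outside the test harness, using only calls that appear in the tests above (the property key comes from IndexingFilters.INDEXINGFILTER_ORDER; the behaviour note is inferred from the first test, not guaranteed by documentation):

    Configuration conf = NutchConfiguration.create();
    // Filters run in the listed order; testNonExistingIndexingFilter above
    // suggests an unresolvable class name is tolerated rather than fatal.
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER,
        "org.apache.nutch.indexer.basic.BasicIndexingFilter"
            + " org.apache.nutch.indexer.metadata.MetadataIndexer");
    IndexingFilters filters = new IndexingFilters(conf);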
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java b/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
new file mode 100644
index 0000000..f3a320d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
@@ -0,0 +1,281 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Properties;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}.
+ */
+public class TestMetadata {
+
+ private static final String CONTENTTYPE = "contenttype";
+
+ /**
+ * Test to ensure that only non-null values get written when the
+ * {@link Metadata} object is written using a Writable.
+ *
+ * @since NUTCH-406
+ *
+ */
+ @Test
+ public void testWriteNonNull() {
+ Metadata met = new Metadata();
+ met.add(CONTENTTYPE, null);
+ met.add(CONTENTTYPE, "text/bogus");
+ met.add(CONTENTTYPE, "text/bogus2");
+ met = writeRead(met);
+
+ Assert.assertNotNull(met);
+ Assert.assertEquals(1, met.size());
+
+ boolean hasBogus = false, hasBogus2 = false;
+
+ String[] values = met.getValues(CONTENTTYPE);
+ Assert.assertNotNull(values);
+ Assert.assertEquals(2, values.length);
+
+ for (int i = 0; i < values.length; i++) {
+ if (values[i].equals("text/bogus")) {
+ hasBogus = true;
+ }
+
+ if (values[i].equals("text/bogus2")) {
+ hasBogus2 = true;
+ }
+ }
+
+ Assert.assertTrue(hasBogus && hasBogus2);
+ }
+
+ /** Test for the <code>add(String, String)</code> method. */
+ @Test
+ public void testAdd() {
+ String[] values = null;
+ Metadata meta = new Metadata();
+
+ values = meta.getValues(CONTENTTYPE);
+ Assert.assertEquals(0, values.length);
+
+ meta.add(CONTENTTYPE, "value1");
+ values = meta.getValues(CONTENTTYPE);
+ Assert.assertEquals(1, values.length);
+ Assert.assertEquals("value1", values[0]);
+
+ meta.add(CONTENTTYPE, "value2");
+ values = meta.getValues(CONTENTTYPE);
+ Assert.assertEquals(2, values.length);
+ Assert.assertEquals("value1", values[0]);
+ Assert.assertEquals("value2", values[1]);
+
+ // NOTE : For now, the same value can be added many times.
+ // Should it be changed?
+ meta.add(CONTENTTYPE, "value1");
+ values = meta.getValues(CONTENTTYPE);
+ Assert.assertEquals(3, values.length);
+ Assert.assertEquals("value1", values[0]);
+ Assert.assertEquals("value2", values[1]);
+ Assert.assertEquals("value1", values[2]);
+ }
+
+ /** Test for the <code>set(String, String)</code> method. */
+ @Test
+ public void testSet() {
+ String[] values = null;
+ Metadata meta = new Metadata();
+
+ values = meta.getValues(CONTENTTYPE);
+ Assert.assertEquals(0, values.length);
+
+ meta.set(CONTENTTYPE, "value1");
+ values = meta.getValues(CONTENTTYPE);
+ Assert.assertEquals(1, values.length);
+ Assert.assertEquals("value1", values[0]);
+
+ meta.set(CONTENTTYPE, "value2");
+ values = meta.getValues(CONTENTTYPE);
+ Assert.assertEquals(1, values.length);
+ Assert.assertEquals("value2", values[0]);
+
+ meta.set(CONTENTTYPE, "new value 1");
+ meta.add("contenttype", "new value 2");
+ values = meta.getValues(CONTENTTYPE);
+ Assert.assertEquals(2, values.length);
+ Assert.assertEquals("new value 1", values[0]);
+ Assert.assertEquals("new value 2", values[1]);
+ }
+
+ /** Test for <code>setAll(Properties)</code> method. */
+ @Test
+ public void testSetProperties() {
+ String[] values = null;
+ Metadata meta = new Metadata();
+ Properties props = new Properties();
+
+ meta.setAll(props);
+ Assert.assertEquals(0, meta.size());
+
+ props.setProperty("name-one", "value1.1");
+ meta.setAll(props);
+ Assert.assertEquals(1, meta.size());
+ values = meta.getValues("name-one");
+ Assert.assertEquals(1, values.length);
+ Assert.assertEquals("value1.1", values[0]);
+
+ props.setProperty("name-two", "value2.1");
+ meta.setAll(props);
+ Assert.assertEquals(2, meta.size());
+ values = meta.getValues("name-one");
+ Assert.assertEquals(1, values.length);
+ Assert.assertEquals("value1.1", values[0]);
+ values = meta.getValues("name-two");
+ Assert.assertEquals(1, values.length);
+ Assert.assertEquals("value2.1", values[0]);
+ }
+
+ /** Test for <code>get(String)</code> method. */
+ @Test
+ public void testGet() {
+ Metadata meta = new Metadata();
+ Assert.assertNull(meta.get("a-name"));
+ meta.add("a-name", "value-1");
+ Assert.assertEquals("value-1", meta.get("a-name"));
+ meta.add("a-name", "value-2");
+ Assert.assertEquals("value-1", meta.get("a-name"));
+ }
+
+ /** Test for <code>isMultiValued()</code> method. */
+ @Test
+ public void testIsMultiValued() {
+ Metadata meta = new Metadata();
+ Assert.assertFalse(meta.isMultiValued("key"));
+ meta.add("key", "value1");
+ Assert.assertFalse(meta.isMultiValued("key"));
+ meta.add("key", "value2");
+ Assert.assertTrue(meta.isMultiValued("key"));
+ }
+
+ /** Test for <code>names()</code> method. */
+ @Test
+ public void testNames() {
+ String[] names = null;
+ Metadata meta = new Metadata();
+ names = meta.names();
+ Assert.assertEquals(0, names.length);
+
+ meta.add("name-one", "value");
+ names = meta.names();
+ Assert.assertEquals(1, names.length);
+ Assert.assertEquals("name-one", names[0]);
+ meta.add("name-two", "value");
+ names = meta.names();
+ Assert.assertEquals(2, names.length);
+ }
+
+ /** Test for <code>remove(String)</code> method. */
+ @Test
+ public void testRemove() {
+ Metadata meta = new Metadata();
+ meta.remove("name-one");
+ Assert.assertEquals(0, meta.size());
+ meta.add("name-one", "value-1.1");
+ meta.add("name-one", "value-1.2");
+ meta.add("name-two", "value-2.2");
+ Assert.assertEquals(2, meta.size());
+ Assert.assertNotNull(meta.get("name-one"));
+ Assert.assertNotNull(meta.get("name-two"));
+ meta.remove("name-one");
+ Assert.assertEquals(1, meta.size());
+ Assert.assertNull(meta.get("name-one"));
+ Assert.assertNotNull(meta.get("name-two"));
+ meta.remove("name-two");
+ Assert.assertEquals(0, meta.size());
+ Assert.assertNull(meta.get("name-one"));
+ Assert.assertNull(meta.get("name-two"));
+ }
+
+ /** Test for <code>equals(Object)</code> method. */
+ @Test
+ public void testObject() {
+ Metadata meta1 = new Metadata();
+ Metadata meta2 = new Metadata();
+ Assert.assertFalse(meta1.equals(null));
+ Assert.assertFalse(meta1.equals("String"));
+ Assert.assertTrue(meta1.equals(meta2));
+ meta1.add("name-one", "value-1.1");
+ Assert.assertFalse(meta1.equals(meta2));
+ meta2.add("name-one", "value-1.1");
+ Assert.assertTrue(meta1.equals(meta2));
+ meta1.add("name-one", "value-1.2");
+ Assert.assertFalse(meta1.equals(meta2));
+ meta2.add("name-one", "value-1.2");
+ Assert.assertTrue(meta1.equals(meta2));
+ meta1.add("name-two", "value-2.1");
+ Assert.assertFalse(meta1.equals(meta2));
+ meta2.add("name-two", "value-2.1");
+ Assert.assertTrue(meta1.equals(meta2));
+ meta1.add("name-two", "value-2.2");
+ Assert.assertFalse(meta1.equals(meta2));
+ meta2.add("name-two", "value-2.x");
+ Assert.assertFalse(meta1.equals(meta2));
+ }
+
+ /** Test for <code>Writable</code> implementation. */
+ @Test
+ public void testWritable() {
+ Metadata result = null;
+ Metadata meta = new Metadata();
+ result = writeRead(meta);
+ Assert.assertEquals(0, result.size());
+ meta.add("name-one", "value-1.1");
+ result = writeRead(meta);
+ Assert.assertEquals(1, result.size());
+ Assert.assertEquals(1, result.getValues("name-one").length);
+ Assert.assertEquals("value-1.1", result.get("name-one"));
+ meta.add("name-two", "value-2.1");
+ meta.add("name-two", "value-2.2");
+ result = writeRead(meta);
+ Assert.assertEquals(2, result.size());
+ Assert.assertEquals(1, result.getValues("name-one").length);
+ Assert.assertEquals("value-1.1", result.getValues("name-one")[0]);
+ Assert.assertEquals(2, result.getValues("name-two").length);
+ Assert.assertEquals("value-2.1", result.getValues("name-two")[0]);
+ Assert.assertEquals("value-2.2", result.getValues("name-two")[1]);
+ }
+
+ /** Round-trip a {@link Metadata} instance through its Writable (de)serialization. */
+ private Metadata writeRead(Metadata meta) {
+ Metadata result = new Metadata();
+ try {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ meta.write(new DataOutputStream(out));
+ result.readFields(new DataInputStream(new ByteArrayInputStream(out
+ .toByteArray())));
+ } catch (IOException ioe) {
+ Assert.fail(ioe.toString());
+ }
+ return result;
+ }
+
+}
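
As a compact summary of the add()/set() semantics the tests above pin down (illustrative snippet, not part of the commit):

    Metadata meta = new Metadata();
    meta.add("key", "a");
    meta.add("key", "b");  // add() appends: getValues("key") -> ["a", "b"]
    meta.set("key", "c");  // set() replaces all previous values: ["c"]
    meta.get("key");       // get() returns the first value: "c"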