Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:45 UTC

[29/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
new file mode 100644
index 0000000..b631319
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -0,0 +1,569 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.StringUtils;
+
+import org.apache.nutch.crawl.CrawlDatum;
+
+import static org.apache.nutch.crawl.CrawlDatum.*;
+
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+
+import static org.junit.Assert.*;
+
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.Test;
+
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Test transitions of {@link CrawlDatum} states during an update of
+ * {@link CrawlDb} (command {@literal updatedb}):
+ * <ul>
+ * <li>simulate updatedb with the old CrawlDatum (db status) and the new one
+ * (fetch status) and test whether the resulting CrawlDatum has the appropriate
+ * status.</li>
+ * <li>also check for further CrawlDatum fields (signature, etc.)</li>
+ * <li>and additional conditions:
+ * <ul>
+ * <li>retry counters</li>
+ * <li>signatures</li>
+ * <li>configuration properties</li>
+ * <li>(additional) CrawlDatums of status linked (stemming from inlinks)</li>
+ * </ul>
+ * </li> </ul>
+ */
+@Category({IntegrationTest.class})
+public class TestCrawlDbStates {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestCrawlDbStates.class);
+
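+  /**
+   * Pairs of {fetch status, corresponding CrawlDb status} spanning the
+   * transition matrix; -1 marks a status without a counterpart on the
+   * other side.
+   */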
+  protected static final byte[][] fetchDbStatusPairs = {
+      { -1, STATUS_DB_UNFETCHED }, { STATUS_FETCH_SUCCESS, STATUS_DB_FETCHED },
+      { STATUS_FETCH_GONE, STATUS_DB_GONE },
+      { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
+      { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM },
+      { STATUS_FETCH_NOTMODIFIED, STATUS_DB_NOTMODIFIED },
+      { STATUS_FETCH_RETRY, -1 }, // fetch_retry does not have a CrawlDb
+                                  // counter-part
+      { -1, STATUS_DB_DUPLICATE }, };
+
+  /** tested {@link FetchSchedule} implementations */
+  protected String[] schedules = { "DefaultFetchSchedule",
+      "AdaptiveFetchSchedule" };
+
+  /** CrawlDatum as result of a link */
+  protected final CrawlDatum linked = new CrawlDatum(STATUS_LINKED,
+      CrawlDBTestUtil.createConfiguration().getInt("db.fetch.interval.default",
+          2592000), 0.1f);
+
+  /**
+   * Test the matrix of state transitions:
+   * <ul>
+   * <li>for all available {@link FetchSchedule} implementations</li>
+   * <li>for every possible status in CrawlDb (including "not in CrawlDb")</li>
+   * <li>for every possible fetch status</li>
+   * <li>and zero to two (0-2) additional in-links</li>
+   * </ul>
+   * call {@literal updatedb} and check whether the resulting CrawlDb status is
+   * the expected one.
+   */
+  @Test
+  public void testCrawlDbStateTransitionMatrix() {
+    LOG.info("Test CrawlDatum state transitions");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>(
+        new CrawlDbReducer(), conf);
+    int retryMax = conf.getInt("db.fetch.retry.max", 3);
+    for (String sched : schedules) {
+      LOG.info("Testing state transitions with " + sched);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      FetchSchedule schedule = FetchScheduleFactory
+          .getFetchSchedule(new JobConf(conf));
+      for (int i = 0; i < fetchDbStatusPairs.length; i++) {
+        byte fromDbStatus = fetchDbStatusPairs[i][1];
+        for (int j = 0; j < fetchDbStatusPairs.length; j++) {
+          byte fetchStatus = fetchDbStatusPairs[j][0];
+          CrawlDatum fromDb = null;
+          if (fromDbStatus == -1) {
+            // nothing yet in CrawlDb
+            // CrawlDatum added by FreeGenerator or via outlink
+          } else {
+            fromDb = new CrawlDatum();
+            fromDb.setStatus(fromDbStatus);
+            // initialize fetchInterval:
+            schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
+          }
+          // expected db status
+          byte toDbStatus = fetchDbStatusPairs[j][1];
+          if (fetchStatus == -1) {
+            if (fromDbStatus == -1) {
+              // nothing fetched yet: new document detected via outlink
+              toDbStatus = STATUS_DB_UNFETCHED;
+            } else {
+              // nothing fetched but new inlinks detected: status is unchanged
+              toDbStatus = fromDbStatus;
+            }
+          } else if (fetchStatus == STATUS_FETCH_RETRY) {
+            // a simple test of fetch_retry (without retries)
+            if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) {
+              toDbStatus = STATUS_DB_UNFETCHED;
+            } else {
+              toDbStatus = STATUS_DB_GONE;
+            }
+          }
+          String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>"
+              : getStatusName(fromDbStatus));
+          String fetchStatusName = (fetchStatus == -1 ? "<only inlinks>"
+              : CrawlDatum.getStatusName(fetchStatus));
+          LOG.info(fromDbStatusName + " + " + fetchStatusName + " => "
+              + getStatusName(toDbStatus));
+          List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+          for (int l = 0; l <= 2; l++) { // number of additional in-links
+            CrawlDatum fetch = null;
+            if (fetchStatus == -1) {
+              // nothing fetched, need at least one in-link
+              if (l == 0)
+                continue;
+            } else {
+              fetch = new CrawlDatum();
+              if (fromDb != null) {
+                fetch.set(fromDb);
+              } else {
+                // not yet in CrawlDb: added by FreeGenerator
+                schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch);
+              }
+              fetch.setStatus(fetchStatus);
+              fetch.setFetchTime(System.currentTimeMillis());
+            }
+            if (fromDb != null)
+              values.add(fromDb);
+            if (fetch != null)
+              values.add(fetch);
+            for (int n = 0; n < l; n++) {
+              values.add(linked);
+            }
+            List<CrawlDatum> res = updateDb.update(values);
+            if (res.size() != 1) {
+              fail("CrawlDb update didn't result in one single CrawlDatum per URL");
+              continue;
+            }
+            byte status = res.get(0).getStatus();
+            if (status != toDbStatus) {
+              fail("CrawlDb update for " + fromDbStatusName + " and "
+                  + fetchStatusName + " and " + l + " inlinks results in "
+                  + getStatusName(status) + " (expected: "
+                  + getStatusName(toDbStatus) + ")");
+            }
+            values.clear();
+          }
+        }
+      }
+    }
+  }
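+
+  /**
+   * Minimal hand-written sketch of a single cell of the transition matrix
+   * above, added as a readable example and assuming the default fetch
+   * schedule from {@link CrawlDBTestUtil#createConfiguration()}: a
+   * previously unfetched URL that is fetched successfully is expected to
+   * end up as db_fetched.
+   */
+  @Test
+  public void testSingleTransitionUnfetchedFetched() {
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    CrawlDbUpdateUtil<CrawlDbReducer> updateDb = new CrawlDbUpdateUtil<CrawlDbReducer>(
+        new CrawlDbReducer(), conf);
+    int interval = conf.getInt("db.fetch.interval.default", 2592000);
+    // old CrawlDb entry: unfetched
+    CrawlDatum fromDb = new CrawlDatum(STATUS_DB_UNFETCHED, interval);
+    // new fetch result: success
+    CrawlDatum fetch = new CrawlDatum(STATUS_FETCH_SUCCESS, interval);
+    fetch.setFetchTime(System.currentTimeMillis());
+    List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+    values.add(fromDb);
+    values.add(fetch);
+    List<CrawlDatum> res = updateDb.update(values);
+    assertEquals(1, res.size());
+    assertEquals(STATUS_DB_FETCHED, res.get(0).getStatus());
+  }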
+
+  /**
+   * Test states after inject: inject must not modify the status of CrawlDatums
+   * already in CrawlDb. Newly injected elements have status "db_unfetched".
+   * Inject is simulated by calling {@link Injector.InjectReducer#reduce()}.
+   */
+  @Test
+  public void testCrawlDbStatTransitionInject() {
+    LOG.info("Test CrawlDatum states in Injector after inject");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    Injector.InjectReducer injector = new Injector.InjectReducer();
+    CrawlDbUpdateTestDriver<Injector.InjectReducer> injectDriver =
+        new CrawlDbUpdateTestDriver<Injector.InjectReducer>(injector, conf);
+    ScoringFilters scfilters = new ScoringFilters(conf);
+    for (String sched : schedules) {
+      LOG.info("Testing inject with " + sched);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      FetchSchedule schedule = FetchScheduleFactory
+          .getFetchSchedule(new JobConf(conf));
+      List<CrawlDatum> values = new ArrayList<CrawlDatum>();
+      for (int i = 0; i < fetchDbStatusPairs.length; i++) {
+        byte fromDbStatus = fetchDbStatusPairs[i][1];
+        byte toDbStatus = fromDbStatus;
+        if (fromDbStatus == -1) {
+          toDbStatus = STATUS_DB_UNFETCHED;
+        } else {
+          CrawlDatum fromDb = new CrawlDatum();
+          fromDb.setStatus(fromDbStatus);
+          schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
+          values.add(fromDb);
+        }
+        LOG.info("inject "
+            + (fromDbStatus == -1 ? "<not in CrawlDb>" : CrawlDatum
+                .getStatusName(fromDbStatus)) + " + "
+            + getStatusName(STATUS_INJECTED) + " => "
+            + getStatusName(toDbStatus));
+        CrawlDatum injected = new CrawlDatum(STATUS_INJECTED, conf.getInt(
+            "db.fetch.interval.default", 2592000), 0.1f);
+        schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, injected);
+        try {
+          scfilters.injectedScore(CrawlDbUpdateUtil.dummyURL, injected);
+        } catch (ScoringFilterException e) {
+          LOG.error(StringUtils.stringifyException(e));
+        }
+        values.add(injected);
+        List<CrawlDatum> res = injectDriver.update(values);
+        if (res.size() != 1) {
+          fail("Inject didn't result in one single CrawlDatum per URL");
+          continue;
+        }
+        byte status = res.get(0).getStatus();
+        if (status != toDbStatus) {
+          fail("Inject for "
+              + (fromDbStatus == -1 ? "" : getStatusName(fromDbStatus)
+                  + " and ") + getStatusName(STATUS_INJECTED) + " results in "
+              + getStatusName(status) + " (expected: "
+              + getStatusName(toDbStatus) + ")");
+        }
+        values.clear();
+      }
+    }
+  }
+
+  /**
+   * Test status db_notmodified detected by
+   * <ul>
+   * <li>signature comparison</li>
+   * <li>or HTTP 304</li>
+   * </ul>
+   * In addition, test for all available {@link FetchSchedule} implementations
+   * whether
+   * <ul>
+   * <li>modified time is set</li>
+   * <li>re-fetch is triggered after a certain time to force the fetched content
+   * to be in a recent segment (old segments are deleted, see comments in
+   * {@link CrawlDbReducer#reduce(Text, Iterator, OutputCollector, Reporter)})</li>
+   * </ul>
+   */
+  @Test
+  public void testCrawlDbReducerNotModified() {
+    LOG.info("Test state notmodified");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    // test not modified detected by signature comparison
+    for (String sched : schedules) {
+      String desc = "test notmodified by signature comparison + " + sched;
+      LOG.info(desc);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchNotModified(conf);
+      if (!crawlUtil.run(20)) {
+        fail("failed: " + desc);
+      }
+    }
+    // test not modified detected by HTTP 304
+    for (String sched : schedules) {
+      String desc = "test notmodified by HTTP 304 + " + sched;
+      LOG.info(desc);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchNotModifiedHttp304(
+          conf);
+      if (!crawlUtil.run(20)) {
+        fail("failed: " + desc);
+      }
+    }
+  }
+
+  protected class CrawlTestFetchNotModified extends ContinuousCrawlTestUtil {
+
+    /** time of the current fetch */
+    protected long currFetchTime;
+    /** time the last fetch took place */
+    protected long lastFetchTime;
+    /**
+     * time the document was first fetched (at all, or after its content changed)
+     */
+    protected long firstFetchTime;
+    /** state in CrawlDb before the last fetch */
+    protected byte previousDbState;
+    /** signature in CrawlDb of previous fetch */
+    protected byte[] lastSignature;
+
+    private long maxFetchInterval;
+    private FetchSchedule schedule;
+
+    CrawlTestFetchNotModified(Configuration conf) {
+      super(conf);
+      maxFetchInterval = conf.getLong("db.fetch.interval.max", 7776000); // default
+                                                                         // = 90
+                                                                         // days
+      maxFetchInterval += (24 * 60 * 60); // but take one day more to avoid
+                                          // false alarms
+      maxFetchInterval *= 1000; // in milli-seconds
+      schedule = FetchScheduleFactory.getFetchSchedule(new JobConf(conf));
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (lastFetchTime > 0
+          && (currFetchTime - lastFetchTime) > maxFetchInterval) {
+        LOG.error("last effective fetch (HTTP 200, not HTTP 304), at "
+            + new Date(lastFetchTime)
+            + ", took place more than db.fetch.interval.max time, "
+            + "segment containing fetched content may have been deleted");
+        return false;
+      }
+      switch (result.getStatus()) {
+      case STATUS_DB_NOTMODIFIED:
+        // db_notmodified is correct if the document has been fetched previously
+        // and it has not been changed since
+        if ((previousDbState == STATUS_DB_FETCHED || previousDbState == STATUS_DB_NOTMODIFIED)) {
+          if (lastSignature != null
+              && result.getSignature() != null
+              && SignatureComparator._compare(lastSignature,
+                  result.getSignature()) != 0) {
+            LOG.error("document has changed (signature changed) but state is still "
+                + getStatusName(STATUS_DB_NOTMODIFIED));
+            return false;
+          }
+          LOG.info("ok: " + result);
+          return checkModifiedTime(result, firstFetchTime);
+        }
+        LOG.warn("notmodified without previous fetch");
+        break;
+      case STATUS_DB_FETCHED:
+        if (previousDbState == STATUS_DB_UNFETCHED) {
+          LOG.info("ok (first fetch): " + result);
+          return checkModifiedTime(result, firstFetchTime);
+        } else if (lastSignature != null
+            && result.getSignature() != null
+            && SignatureComparator._compare(lastSignature,
+                result.getSignature()) != 0) {
+          LOG.info("ok (content changed): " + result);
+          // expect modified time == now
+          return checkModifiedTime(result, currFetchTime);
+        } else {
+          LOG.warn("document has not changed, db_notmodified expected");
+        }
+        break;
+      case STATUS_DB_UNFETCHED:
+        /**
+         * Status db_unfetched is possible with {@link AdaptiveFetchSchedule}
+         * because {@link CrawlDbReducer#reduce} calls
+         * {@link FetchSchedule#forceRefetch} to force a re-fetch if fetch
+         * interval grows too large.
+         */
+        if (schedule.getClass() == AdaptiveFetchSchedule.class) {
+          LOG.info("state set to unfetched by AdaptiveFetchSchedule");
+          if (result.getSignature() != null) {
+            LOG.warn("must reset signature: " + result);
+            return false;
+          }
+          LOG.info("ok: " + result);
+          firstFetchTime = 0;
+          return true;
+        }
+      }
+      LOG.warn("wrong result: " + result);
+      return false;
+    }
+
+    // test modified time
+    private boolean checkModifiedTime(CrawlDatum result, long modifiedTime) {
+      if (result.getModifiedTime() == 0) {
+        LOG.error("modified time not set (TODO: not set by DefaultFetchSchedule)");
+        // TODO: return false (but DefaultFetchSchedule does not set modified
+        // time, see NUTCH-933)
+        return true;
+      } else if (modifiedTime == result.getModifiedTime()) {
+        return true;
+      }
+      LOG.error("wrong modified time: " + new Date(result.getModifiedTime())
+          + " (expected " + new Date(modifiedTime) + ")");
+      return false;
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      lastFetchTime = currFetchTime;
+      currFetchTime = currentTime;
+      previousDbState = datum.getStatus();
+      lastSignature = datum.getSignature();
+      datum = super.fetch(datum, currentTime);
+      if (firstFetchTime == 0) {
+        firstFetchTime = currFetchTime;
+      } else if ((currFetchTime - firstFetchTime) > (duration / 2)) {
+        // simulate a modification after "one year"
+        changeContent();
+        firstFetchTime = currFetchTime;
+      }
+      return datum;
+    }
+  }
+
+  protected class CrawlTestFetchNotModifiedHttp304 extends
+      CrawlTestFetchNotModified {
+
+    CrawlTestFetchNotModifiedHttp304(Configuration conf) {
+      super(conf);
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      lastFetchTime = currFetchTime;
+      currFetchTime = currentTime;
+      previousDbState = datum.getStatus();
+      lastSignature = datum.getSignature();
+      int httpCode;
+      /*
+       * The document is "really" fetched (no HTTP 304) if last-modified time
+       * and signature are unset (the page has not been fetched before or a
+       * fetch is forced), or, for test purposes, after "one year" to simulate
+       * a modification.
+       */
+      if (datum.getModifiedTime() == 0 && datum.getSignature() == null
+          || (currFetchTime - firstFetchTime) > (duration / 2)) {
+        firstFetchTime = currFetchTime;
+        httpCode = 200;
+        datum.setStatus(STATUS_FETCH_SUCCESS);
+        // modify content to change signature
+        changeContent();
+      } else {
+        httpCode = 304;
+        datum.setStatus(STATUS_FETCH_NOTMODIFIED);
+      }
+      LOG.info("fetched with HTTP " + httpCode + " => "
+          + getStatusName(datum.getStatus()));
+      datum.setFetchTime(currentTime);
+      return datum;
+    }
+  }
+
+  /**
+   * NUTCH-1245: a fetch_gone should always result in a db_gone.
+   * <p>
+   * Even in a long-running continuous crawl, when a gone page is re-fetched
+   * several times over time.
+   * </p>
+   */
+  @Test
+  public void testCrawlDbReducerPageGoneSchedule1() {
+    LOG.info("NUTCH-1245: test long running continuous crawl");
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(
+        STATUS_FETCH_GONE, STATUS_DB_GONE);
+    if (!crawlUtil.run(20)) {
+      fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+    }
+  }
+
+  /**
+   * NUTCH-1245: a fetch_gone should always result in a db_gone.
+   * <p>
+   * Simulates a misconfiguration where db.fetch.interval.default is set to a
+   * value &gt; (fetchIntervalMax * 1.5).
+   * </p>
+   */
+  @Test
+  public void testCrawlDbReducerPageGoneSchedule2() {
+    LOG.info("NUTCH-1245 (misconfiguration): test with db.fetch.interval.default > (1.5 * db.fetch.interval.max)");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    int fetchIntervalMax = conf.getInt("db.fetch.interval.max", 0);
+    conf.setInt("db.fetch.interval.default", 3 + (int) (fetchIntervalMax * 1.5));
+    ContinuousCrawlTestUtil crawlUtil = new ContinuousCrawlTestUtil(conf,
+        STATUS_FETCH_GONE, STATUS_DB_GONE);
+    if (!crawlUtil.run(0)) {
+      fail("fetch_gone did not result in a db_gone (NUTCH-1245)");
+    }
+  }
+
+  /**
+   * Test whether signatures are reset for "content-less" states (gone,
+   * redirect, etc.): otherwise, if this state is temporary and the document
+   * appears again with the old content, it may get marked as not_modified in
+   * CrawlDb just after the redirect state. In this case we cannot expect
+   * content in segments. Cf. NUTCH-1422: reset signature for redirects.
+   */
+  // TODO: can only be tested if the fix is implemented in CrawlDbReducer
+  @Test
+  public void testSignatureReset() {
+    LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    for (String sched : schedules) {
+      LOG.info("Testing reset signature with " + sched);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
+      ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(conf);
+      if (!crawlUtil.run(20)) {
+        fail("failed: signature not reset");
+      }
+    }
+  }
+
+  private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil {
+
+    byte[][] noContentStates = { { STATUS_FETCH_GONE, STATUS_DB_GONE },
+        { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
+        { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM } };
+
+    int counter = 0;
+    byte fetchState;
+
+    public CrawlTestSignatureReset(Configuration conf) {
+      super(conf);
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      datum = super.fetch(datum, currentTime);
+      counter++;
+      // flip-flop between a successful fetch and one of the content-less states
+      if (counter % 2 == 1) {
+        fetchState = STATUS_FETCH_SUCCESS;
+      } else {
+        fetchState = noContentStates[(counter % 6) / 2][0];
+      }
+      LOG.info("Step " + counter + ": fetched with "
+          + getStatusName(fetchState));
+      datum.setStatus(fetchState);
+      return datum;
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (result.getStatus() == STATUS_DB_NOTMODIFIED
+          && !(fetchState == STATUS_FETCH_SUCCESS || fetchState == STATUS_FETCH_NOTMODIFIED)) {
+        LOG.error("Should never get into state "
+            + getStatusName(STATUS_DB_NOTMODIFIED) + " from "
+            + getStatusName(fetchState));
+        return false;
+      }
+      if (result.getSignature() != null
+          && !(result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED)) {
+        LOG.error("Signature not reset in state "
+            + getStatusName(result.getStatus()));
+        // not failing here: the unreset signature is not the problem itself,
+        // only the possible cause of a wrong db_notmodified
+      }
+      return true;
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
new file mode 100644
index 0000000..0ce3c5f
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
@@ -0,0 +1,373 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Basic generator test: 1. inserts entries into the crawldb 2. generates
+ * entries to fetch 3. verifies that the number of generated urls matches
+ * 4. verifies that the highest-scoring urls are generated
+ * 
+ */
+@Category({IntegrationTest.class})
+public class TestGenerator {
+
+  Configuration conf;
+
+  Path dbDir;
+
+  Path segmentsDir;
+
+  FileSystem fs;
+
+  final static Path testdir = new Path("build/test/generator-test");
+
+  @Before
+  public void setUp() throws Exception {
+    conf = CrawlDBTestUtil.createConfiguration();
+    fs = FileSystem.get(conf);
+    fs.delete(testdir, true);
+  }
+
+  @After
+  public void tearDown() {
+    delete(testdir);
+  }
+
+  private void delete(Path p) {
+    try {
+      fs.delete(p, true);
+    } catch (IOException e) {
+    }
+  }
+
+  /**
+   * Test that the generator generates a fetchlist ordered by score (descending).
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testGenerateHighest() throws Exception {
+
+    final int NUM_RESULTS = 2;
+
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    for (int i = 0; i <= 100; i++) {
+      list.add(createURLCrawlDatum("http://aaa/" + pad(i), 1, i));
+    }
+
+    createCrawlDB(list);
+
+    Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false);
+
+    Path fetchlist = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
+    // sort urls by score desc
+    Collections.sort(l, new ScoreComparator());
+
+    // verify we got right amount of records
+    Assert.assertEquals(NUM_RESULTS, l.size());
+
+    // verify we have the highest scoring urls
+    Assert.assertEquals("http://aaa/100", (l.get(0).url.toString()));
+    Assert.assertEquals("http://aaa/099", (l.get(1).url.toString()));
+  }
+
+  private String pad(int i) {
+    String s = Integer.toString(i);
+    while (s.length() < 3) {
+      s = "0" + s;
+    }
+    return s;
+  }
+
+  /**
+   * Comparator that sorts by score desc.
+   */
+  public class ScoreComparator implements Comparator<URLCrawlDatum> {
+
+    public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) {
+        return -1;
+      }
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) {
+        return 1;
+      }
+      return 0;
+    }
+  }
+
+  /**
+   * Test that the generator obeys the property "generate.max.count" with the
+   * default (per-host) count mode.
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testGenerateHostLimit() throws Exception {
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index1.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index2.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index3.html", 1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2);
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, false);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(1, fetchList.size());
+
+    myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(2, fetchList.size());
+
+    myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(3, fetchList.size());
+  }
+
+  /**
+   * Test that the generator obeys the property "generate.max.count" when
+   * "generate.count.mode" is set to "domain".
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testGenerateDomainLimit() throws Exception {
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://a.example.com/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://b.example.com/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://c.example.com/index.html", 1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 2);
+    myConfiguration.set(Generator.GENERATOR_COUNT_MODE,
+        Generator.GENERATOR_COUNT_VALUE_DOMAIN);
+
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, false);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(1, fetchList.size());
+
+    myConfiguration = new Configuration(myConfiguration);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 3);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(2, fetchList.size());
+
+    myConfiguration = new Configuration(myConfiguration);
+    myConfiguration.setInt(Generator.GENERATOR_MAX_COUNT, 4);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    Assert.assertEquals(3, fetchList.size());
+  }
+
+  /**
+   * Test generator obeys the filter setting.
+   * 
+   * @throws Exception
+   * @throws IOException
+   */
+  @Test
+  public void testFilter() throws IOException, Exception {
+
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
+    list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.set("urlfilter.suffix.file", "filter-all.txt");
+
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
+        myConfiguration, true);
+
+    Assert.assertNull("should be null (0 entries)", generatedSegment);
+
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
+        false);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify nothing got filtered
+    Assert.assertEquals(list.size(), fetchList.size());
+
+  }
+
+  /**
+   * Read contents of fetchlist.
+   * 
+   * @param fetchlist
+   *          path to Generated fetchlist
+   * @return Generated {@link URLCrawlDatum} objects
+   * @throws IOException
+   */
+  private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
+      throws IOException {
+    // verify results
+    Option rFile = SequenceFile.Reader.file(fetchlist);
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+
+    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+    READ: do {
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value)) {
+        break READ;
+      }
+      l.add(new URLCrawlDatum(key, value));
+    } while (true);
+
+    reader.close();
+    return l;
+  }
+
+  /**
+   * Generate Fetchlist.
+   * 
+   * @param numResults
+   *          number of results to generate
+   * @param config
+   *          Configuration to use
+   * @param filter
+   *          whether to apply URL filters
+   * @return path to generated segment, or null if no entries were generated
+   * @throws IOException
+   */
+  private Path generateFetchlist(int numResults, Configuration config,
+      boolean filter) throws IOException {
+    // generate segment
+    Generator g = new Generator(config);
+    Path[] generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
+        Long.MAX_VALUE, filter, false);
+    if (generatedSegment == null)
+      return null;
+    return generatedSegment[0];
+  }
+
+  /**
+   * Creates CrawlDB.
+   * 
+   * @param list
+   *          database contents
+   * @throws IOException
+   * @throws Exception
+   */
+  private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws IOException,
+      Exception {
+    dbDir = new Path(testdir, "crawldb");
+    segmentsDir = new Path(testdir, "segments");
+    fs.mkdirs(dbDir);
+    fs.mkdirs(segmentsDir);
+
+    // create crawldb
+    CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
+  }
+
+  /**
+   * Constructs new {@link URLCrawlDatum} from submitted parameters.
+   * 
+   * @param url
+   *          url to use
+   * @param fetchInterval
+   *          {@link CrawlDatum#setFetchInterval(float)}
+   * @param score
+   *          {@link CrawlDatum#setScore(float)}
+   * @return Constructed object
+   */
+  private URLCrawlDatum createURLCrawlDatum(final String url,
+      final int fetchInterval, final float score) {
+    return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
+        CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
new file mode 100644
index 0000000..59a3e8c
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Basic injector test: 1. Creates a text file with urls 2. Injects them into
+ * crawldb 3. Reads crawldb entries and verifies contents 4. Injects more urls
+ * into the crawldb 5. Reads crawldb entries and verifies contents
+ * 
+ */
+@Category({IntegrationTest.class})
+public class TestInjector {
+
+  private Configuration conf;
+  private FileSystem fs;
+  final static Path testdir = new Path("build/test/inject-test");
+  Path crawldbPath;
+  Path urlPath;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = CrawlDBTestUtil.createConfiguration();
+    urlPath = new Path(testdir, "urls");
+    crawldbPath = new Path(testdir, "crawldb");
+    fs = FileSystem.get(conf);
+    if (fs.exists(urlPath))
+      fs.delete(urlPath, false);
+    if (fs.exists(crawldbPath))
+      fs.delete(crawldbPath, true);
+  }
+
+  @After
+  public void tearDown() throws IOException {
+    fs.delete(testdir, true);
+  }
+
+  @Test
+  public void testInject()
+      throws IOException, ClassNotFoundException, InterruptedException {
+    ArrayList<String> urls = new ArrayList<String>();
+    // Use a separate list for metadata (MD) so that the url lists can still
+    // be compared with containsAll
+    ArrayList<String> metadata = new ArrayList<String>();
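+    // Each seed line has the form "<url>\t<key>=<value>\t...": nutch.score
+    // and nutch.fetchInterval are interpreted by the Injector, any other
+    // key=value pair is kept as CrawlDatum metadata (verified further below).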
+    for (int i = 0; i < 100; i++) {
+      urls.add("http://zzz.com/" + i + ".html");
+      metadata.add("\tnutch.score=2." + i
+          + "\tnutch.fetchInterval=171717\tkey=value");
+    }
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata);
+
+    Injector injector = new Injector(conf);
+    injector.inject(crawldbPath, urlPath);
+
+    // verify results
+    List<String> read = readCrawldb();
+
+    Collections.sort(read);
+    Collections.sort(urls);
+
+    Assert.assertEquals(urls.size(), read.size());
+
+    Assert.assertTrue(read.containsAll(urls));
+    Assert.assertTrue(urls.containsAll(read));
+
+    // inject more urls
+    ArrayList<String> urls2 = new ArrayList<String>();
+    for (int i = 0; i < 100; i++) {
+      urls2.add("http://xxx.com/" + i + ".html");
+      // We'll overwrite previously injected records but preserve their original
+      // MD
+      urls2.add("http://zzz.com/" + i + ".html");
+    }
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
+    injector = new Injector(conf);
+    conf.setBoolean("db.injector.update", true);
+    injector.inject(crawldbPath, urlPath);
+    urls.addAll(urls2);
+
+    // verify results
+    read = readCrawldb();
+
+    Collections.sort(read);
+    Collections.sort(urls);
+
+    // We should have 100 fewer records because the zzz.com urls were overwritten
+    Assert.assertEquals(urls.size() - 100, read.size());
+
+    Assert.assertTrue(read.containsAll(urls));
+    Assert.assertTrue(urls.containsAll(read));
+
+    // Check if we correctly preserved MD
+    Map<String, CrawlDatum> records = readCrawldbRecords();
+
+    // Iterate over the urls; for the http://zzz.com/ prefixed URLs we check
+    // metadata, score and fetch interval
+    Text writableKey = new Text("key");
+    Text writableValue = new Text("value");
+    for (String url : urls) {
+      if (url.indexOf("http://zzz") == 0) {
+        // Check for fetch interval
+        Assert.assertTrue(records.get(url).getFetchInterval() == 171717);
+        // Check that the custom nutch.score overrode the default score (1.0)
+        Assert.assertTrue(records.get(url).getScore() != 1.0);
+        // Check for MD key=value
+        Assert.assertEquals(writableValue,
+            records.get(url).getMetaData().get(writableKey));
+      }
+    }
+  }
+
+  private List<String> readCrawldb() throws IOException {
+    Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
+        + "/part-r-00000/data");
+    System.out.println("reading:" + dbfile);
+    Option rFile = SequenceFile.Reader.file(dbfile);
+    @SuppressWarnings("resource")
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+    ArrayList<String> read = new ArrayList<String>();
+
+    READ: do {
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value))
+        break READ;
+      read.add(key.toString());
+    } while (true);
+
+    return read;
+  }
+
+  private HashMap<String, CrawlDatum> readCrawldbRecords() throws IOException {
+    Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
+        + "/part-r-00000/data");
+    System.out.println("reading:" + dbfile);
+    Option rFile = SequenceFile.Reader.file(dbfile);
+    @SuppressWarnings("resource")
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
+    HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>();
+
+    READ: do {
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value))
+        break READ;
+      read.put(key.toString(), value);
+    } while (true);
+
+    return read;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
new file mode 100644
index 0000000..23aaa88
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestLinkDbMerger.java
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestLinkDbMerger {
+  private static final Logger LOG = Logger.getLogger(TestLinkDbMerger.class
+      .getName());
+
+  String url10 = "http://example.com/foo";
+  String[] urls10 = new String[] { "http://example.com/100",
+      "http://example.com/101" };
+
+  String url11 = "http://example.com/";
+  String[] urls11 = new String[] { "http://example.com/110",
+      "http://example.com/111" };
+
+  String url20 = "http://example.com/";
+  String[] urls20 = new String[] { "http://foo.com/200", "http://foo.com/201" };
+  String url21 = "http://example.com/bar";
+  String[] urls21 = new String[] { "http://foo.com/210", "http://foo.com/211" };
+
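+  // url11 and url20 are intentionally the same URL ("http://example.com/"),
+  // so after merging both linkdbs its inlinks are the union of urls11 and
+  // urls20; all other URLs keep their original inlinks.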
+  String[] urls10_expected = urls10;
+  String[] urls11_expected = new String[] { urls11[0], urls11[1], urls20[0],
+      urls20[1] };
+  String[] urls20_expected = urls11_expected;
+  String[] urls21_expected = urls21;
+
+  TreeMap<String, String[]> init1 = new TreeMap<String, String[]>();
+  TreeMap<String, String[]> init2 = new TreeMap<String, String[]>();
+  HashMap<String, String[]> expected = new HashMap<String, String[]>();
+  Configuration conf;
+  Path testDir;
+  FileSystem fs;
+  LinkDbReader reader;
+
+  @Before
+  public void setUp() throws Exception {
+    init1.put(url10, urls10);
+    init1.put(url11, urls11);
+    init2.put(url20, urls20);
+    init2.put(url21, urls21);
+    expected.put(url10, urls10_expected);
+    expected.put(url11, urls11_expected);
+    expected.put(url20, urls20_expected);
+    expected.put(url21, urls21_expected);
+    conf = NutchConfiguration.create();
+    fs = FileSystem.get(conf);
+    testDir = new Path("build/test/test-linkdb-"
+        + new java.util.Random().nextInt());
+    fs.mkdirs(testDir);
+  }
+
+  @After
+  public void tearDown() {
+    try {
+      if (fs.exists(testDir))
+        fs.delete(testDir, true);
+    } catch (Exception e) {
+    }
+    try {
+      reader.close();
+    } catch (Exception e) {
+    }
+  }
+
+  @Test
+  public void testMerge() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    FileSystem fs = FileSystem.get(conf);
+    fs.mkdirs(testDir);
+    Path linkdb1 = new Path(testDir, "linkdb1");
+    Path linkdb2 = new Path(testDir, "linkdb2");
+    Path output = new Path(testDir, "output");
+    createLinkDb(conf, fs, linkdb1, init1);
+    createLinkDb(conf, fs, linkdb2, init2);
+    LinkDbMerger merger = new LinkDbMerger(conf);
+    LOG.fine("* merging linkdbs to " + output);
+    merger.merge(output, new Path[] { linkdb1, linkdb2 }, false, false);
+    LOG.fine("* reading linkdb: " + output);
+    reader = new LinkDbReader(conf, output);
+    Iterator<String> it = expected.keySet().iterator();
+    while (it.hasNext()) {
+      String url = it.next();
+      LOG.fine("url=" + url);
+      String[] vals = expected.get(url);
+      Inlinks inlinks = reader.getInlinks(new Text(url));
+      // may not be null
+      Assert.assertNotNull(inlinks);
+      ArrayList<String> links = new ArrayList<String>();
+      Iterator<?> it2 = inlinks.iterator();
+      while (it2.hasNext()) {
+        Inlink in = (Inlink) it2.next();
+        links.add(in.getFromUrl());
+      }
+      for (int i = 0; i < vals.length; i++) {
+        LOG.fine(" -> " + vals[i]);
+        Assert.assertTrue(links.contains(vals[i]));
+      }
+    }
+    reader.close();
+    fs.delete(testDir, true);
+  }
+
+  private void createLinkDb(Configuration config, FileSystem fs, Path linkdb,
+      TreeMap<String, String[]> init) throws Exception {
+    LOG.fine("* creating linkdb: " + linkdb);
+    Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
+    
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(Inlinks.class);
+    MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+        "part-00000"), wKeyOpt, wValueOpt);
+    Iterator<String> it = init.keySet().iterator();
+    while (it.hasNext()) {
+      String key = it.next();
+      Inlinks inlinks = new Inlinks();
+      String[] vals = init.get(key);
+      for (int i = 0; i < vals.length; i++) {
+        Inlink in = new Inlink(vals[i], vals[i]);
+        inlinks.add(in);
+      }
+      writer.append(new Text(key), inlinks);
+    }
+    writer.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
new file mode 100644
index 0000000..db82d7a
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestSignatureFactory.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestSignatureFactory {
+
+  @Test
+  public void testGetSignature() {
+    Configuration conf = NutchConfiguration.create();
+    Signature signature1 = SignatureFactory.getSignature(conf);
+    Signature signature2 = SignatureFactory.getSignature(conf);
+    Assert.assertNotNull(signature1);
+    Assert.assertNotNull(signature2);
+    Assert.assertEquals(signature1, signature2);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
new file mode 100644
index 0000000..a23d080
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDBTestUtil;
+import org.apache.nutch.crawl.Generator;
+import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.mortbay.jetty.Server;
+
+/**
+ * Basic fetcher test: 1. generate seedlist 2. inject 3. generate 4. fetch 5.
+ * verify contents
+ * 
+ */
+public class TestFetcher {
+
+  final static Path testdir = new Path("build/test/fetch-test");
+  Configuration conf;
+  FileSystem fs;
+  Path crawldbPath;
+  Path segmentsPath;
+  Path urlPath;
+  Server server;
+
+  @Before
+  public void setUp() throws Exception {
+    conf = CrawlDBTestUtil.createConfiguration();
+    fs = FileSystem.get(conf);
+    fs.delete(testdir, true);
+    urlPath = new Path(testdir, "urls");
+    crawldbPath = new Path(testdir, "crawldb");
+    segmentsPath = new Path(testdir, "segments");
+    server = CrawlDBTestUtil.getServer(
+        conf.getInt("content.server.port", 50000),
+        "build/test/data/fetch-test-site");
+    server.start();
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    server.stop();
+    for (int i = 0; i < 5; i++) {
+      if (!server.isStopped()) {
+        Thread.sleep(1000);
+      }
+    }
+    fs.delete(testdir, true);
+  }
+
+  @Test
+  @Category(IntegrationTest.class)
+  public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
+
+    // generate seedlist
+    ArrayList<String> urls = new ArrayList<String>();
+
+    addUrl(urls, "index.html");
+    addUrl(urls, "pagea.html");
+    addUrl(urls, "pageb.html");
+    addUrl(urls, "dup_of_pagea.html");
+    addUrl(urls, "nested_spider_trap.html");
+    addUrl(urls, "exception.html");
+
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+
+    // inject
+    Injector injector = new Injector(conf);
+    injector.inject(crawldbPath, urlPath);
+
+    // generate
+    Generator g = new Generator(conf);
+    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
+        Long.MAX_VALUE, Long.MAX_VALUE, false, false);
+
+    long time = System.currentTimeMillis();
+    // fetch
+    Fetcher fetcher = new Fetcher(conf);
+
+    // Set fetcher.parse to true
+    conf.setBoolean("fetcher.parse", true);
+
+    fetcher.fetch(generatedSegment[0], 1);
+
+    time = System.currentTimeMillis() - time;
+
+    // verify politeness, time taken should be more than (num_of_pages +1)*delay
+    int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat(
+        "fetcher.server.delay", 5));
+    Assert.assertTrue(time > minimumTime);
+
+    // verify content
+    Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME),
+        "part-00000/data");
+    @SuppressWarnings("resource")
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
+
+    ArrayList<String> handledurls = new ArrayList<String>();
+
+    READ_CONTENT: do {
+      Text key = new Text();
+      Content value = new Content();
+      if (!reader.next(key, value))
+        break READ_CONTENT;
+      String contentString = new String(value.getContent());
+      if (contentString.indexOf("Nutch fetcher test page") != -1) {
+        handledurls.add(key.toString());
+      }
+    } while (true);
+
+    reader.close();
+
+    Collections.sort(urls);
+    Collections.sort(handledurls);
+
+    // verify that enough pages were handled
+    Assert.assertEquals(urls.size(), handledurls.size());
+
+    // verify that correct pages were handled
+    Assert.assertTrue(handledurls.containsAll(urls));
+    Assert.assertTrue(urls.containsAll(handledurls));
+
+    handledurls.clear();
+
+    // verify parse data
+    Path parseData = new Path(
+        new Path(generatedSegment[0], ParseData.DIR_NAME), "part-00000/data");
+    reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
+
+    READ_PARSE_DATA: do {
+      Text key = new Text();
+      ParseData value = new ParseData();
+      if (!reader.next(key, value))
+        break READ_PARSE_DATA;
+      // make sure they all contain "nutch.segment.name" and
+      // "nutch.content.digest"
+      // keys in parse metadata
+      Metadata contentMeta = value.getContentMeta();
+      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
+          && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
+        handledurls.add(key.toString());
+      }
+    } while (true);
+
+    Collections.sort(handledurls);
+
+    Assert.assertEquals(urls.size(), handledurls.size());
+
+    Assert.assertTrue(handledurls.containsAll(urls));
+    Assert.assertTrue(urls.containsAll(handledurls));
+  }
+
+  private void addUrl(ArrayList<String> urls, String page) {
+    urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/"
+        + page);
+  }
+
+  @Test
+  public void testAgentNameCheck() {
+
+    boolean failedNoAgentName = false;
+    conf.set("http.agent.name", "");
+
+    try {
+      conf.setBoolean("fetcher.parse", false);
+      Fetcher fetcher = new Fetcher(conf);
+      fetcher.fetch(null, 1);
+    } catch (IllegalArgumentException iae) {
+      String message = iae.getMessage();
+      failedNoAgentName = message.equals("Fetcher: No agents listed in "
+          + "'http.agent.name' property.");
+    } catch (Exception e) {
+    }
+
+    Assert.assertTrue(failedNoAgentName);
+  }
+
+}
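
The politeness assertion in testFetch() above is simple arithmetic: with a per-request
delay of fetcher.server.delay seconds against a single host, fetching N pages cannot
finish in less than roughly (N + 1) * delay seconds. A minimal sketch of that bound,
assuming the delay resolves to the 5-second fallback used in the getFloat() call and
using the six seed URLs added above (the class name is illustrative only):

    public class PolitenessBoundSketch {
      public static void main(String[] args) {
        int pages = 6;               // urls.size() in testFetch()
        float delaySeconds = 5.0f;   // conf.getFloat("fetcher.server.delay", 5)
        int minimumTimeMs = (int) ((pages + 1) * 1000 * delaySeconds);
        // (6 + 1) * 1000 * 5 = 35000 ms: the lower bound the test compares against
        System.out.println("minimum fetch time: " + minimumTimeMs + " ms");
      }
    }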

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
new file mode 100644
index 0000000..3a25f26
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.hadoop.mrunit.ReduceDriver;
+import org.apache.hadoop.mrunit.types.Pair;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.NutchWritable;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.Reducer;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+
+/** Test {@link IndexerMapReduce} */
+public class TestIndexerMapReduce {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(TestIndexerMapReduce.class);
+
+  public static String testUrl = "http://nutch.apache.org/";
+  public static Text testUrlText = new Text(testUrl);
+  public static String htmlContentType = "text/html";
+  public static String testHtmlDoc = "<!DOCTYPE html>\n"
+      + "<html>\n"
+      + "<head>\n"
+      + "<title>Test Indexing Binary Content</title>\n"
+      + "<meta charset=\"utf-8\">\n"
+      + "<meta name=\"keywords\" lang=\"en\" content=\"charset, encoding\" />\n"
+      + "<meta name=\"keywords\" lang=\"fr\" content=\"codage des caract�res\" />\n"
+      + "<meta name=\"keywords\" lang=\"cs\" content=\"k�dov�n� znak\u016f\" />\n"
+      + "</head>\n"
+      + "<body>\n"
+      + "<p>\n"
+      + "<ul>\n"
+      + "  <li lang=\"en\">English: character set, encoding\n"
+      + "  <li lang=\"fr\">Fran�ais: codage des caract�res\n"
+      + "  <li lang=\"cs\">\u010ce\u0161tina: k�dov�n� znak\u016f (not covered by Latin-1)\n"
+      + "</ul>\n"
+      + "</body>\n"
+      + "</html>";
+  public static Metadata htmlMeta = new Metadata();
+  static {
+    htmlMeta.add("Content-Type", "text/html");
+    // add segment and signature to avoid NPEs
+    htmlMeta.add(Nutch.SEGMENT_NAME_KEY, "123");
+    htmlMeta.add(Nutch.SIGNATURE_KEY, "123");
+  }
+  public static ParseText parseText = new ParseText("Test");
+  public static ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+      "Test", new Outlink[] {}, htmlMeta);
+  public static CrawlDatum crawlDatumDbFetched = new CrawlDatum(
+      CrawlDatum.STATUS_DB_FETCHED, 60 * 60 * 24);
+  public static CrawlDatum crawlDatumFetchSuccess = new CrawlDatum(
+      CrawlDatum.STATUS_FETCH_SUCCESS, 60 * 60 * 24);
+
+  private Reducer<Text, NutchWritable, Text, NutchIndexAction> reducer = new IndexerMapReduce();
+  private ReduceDriver<Text, NutchWritable, Text, NutchIndexAction> reduceDriver;
+  private Configuration configuration;
+
+
+  /**
+   * Test indexing of base64-encoded binary content.
+   */
+  @Test
+  @Category(IntegrationTest.class)
+  public void testBinaryContentBase64() {
+    configuration = NutchConfiguration.create();
+    configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);
+
+    Charset[] testCharsets = { StandardCharsets.UTF_8,
+        Charset.forName("iso-8859-1"), Charset.forName("iso-8859-2") };
+    for (Charset charset : testCharsets) {
+      LOG.info("Testing indexing binary content as base64 for charset {}",
+          charset.name());
+
+      String htmlDoc = testHtmlDoc;
+      if (charset != StandardCharsets.UTF_8) {
+        htmlDoc = htmlDoc.replaceAll("utf-8", charset.name());
+        if (charset.name().equalsIgnoreCase("iso-8859-1")) {
+          // Western-European character set: remove Czech content
+          htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"cs\".+?\\n", "");
+        } else if (charset.name().equalsIgnoreCase("iso-8859-2")) {
+          // Eastern-European character set: remove French content
+          htmlDoc = htmlDoc.replaceAll("\\s*<[^>]+\\slang=\"fr\".+?\\n", "");
+        }
+      }
+
+      Content content = new Content(testUrl, testUrl,
+          htmlDoc.getBytes(charset), htmlContentType, htmlMeta,
+          configuration);
+
+      NutchDocument doc = runIndexer(crawlDatumDbFetched,
+          crawlDatumFetchSuccess, parseText, parseData, content);
+      assertNotNull("No NutchDocument indexed", doc);
+
+      String binaryContentBase64 = (String) doc.getField("binaryContent")
+          .getValues().get(0);
+      LOG.info("binary content (base64): {}", binaryContentBase64);
+      String binaryContent = new String(
+          Base64.decodeBase64(binaryContentBase64), charset);
+      LOG.info("binary content (decoded): {}", binaryContent);
+      assertEquals(
+          "Binary content (" + charset + ") not correctly saved as base64",
+          htmlDoc, binaryContent);
+    }
+  }
+
+  /**
+   * Run {@link IndexerMapReduce#reduce} to get an &quot;indexed&quot;
+   * {@link NutchDocument} by passing objects from segment and CrawlDb to the
+   * indexer.
+   *
+   * @param dbDatum
+   *          crawl datum from CrawlDb
+   * @param fetchDatum
+   *          crawl datum (fetch status) from segment
+   * @param parseText
+   *          plain text from parsed document
+   * @param parseData
+   *          parse data
+   * @param content
+   *          (optional, only needed when indexing binary content) protocol content
+   * @return &quot;indexed&quot; document
+   */
+  public NutchDocument runIndexer(CrawlDatum dbDatum, CrawlDatum fetchDatum,
+      ParseText parseText, ParseData parseData, Content content) {
+    List<NutchWritable> values = new ArrayList<NutchWritable>();
+    values.add(new NutchWritable(dbDatum));
+    values.add(new NutchWritable(fetchDatum));
+    values.add(new NutchWritable(parseText));
+    values.add(new NutchWritable(parseData));
+    values.add(new NutchWritable(content));
+    reduceDriver = ReduceDriver.newReduceDriver(reducer);
+    reduceDriver.setConfiguration(configuration);
+    reduceDriver.withInput(testUrlText, values);
+    List<Pair<Text, NutchIndexAction>> reduceResult;
+    NutchDocument doc = null;
+    try {
+      reduceResult = reduceDriver.run();
+      for (Pair<Text, NutchIndexAction> p : reduceResult) {
+        if (p.getSecond().action != NutchIndexAction.DELETE) {
+          doc = p.getSecond().doc;
+        }
+      }
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+    }
+    return doc;
+  }
+
+}
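
The charset loop in testBinaryContentBase64() above reduces to a plain base64 round
trip: the bytes from htmlDoc.getBytes(charset) must come back unchanged from the
indexed "binaryContent" field, so decoding them with the same charset reproduces the
document. A self-contained sketch of that round trip with the same commons-codec
Base64 class the test imports; the document string and class name are stand-ins, and
the encoding half only mirrors what the indexer is expected to do when
INDEXER_BINARY_AS_BASE64 is enabled:

    import java.nio.charset.Charset;
    import java.nio.charset.StandardCharsets;
    import org.apache.commons.codec.binary.Base64;

    public class Base64RoundTripSketch {
      public static void main(String[] args) {
        String doc = "<html><body>charset round trip</body></html>"; // stand-in document
        Charset[] charsets = { StandardCharsets.UTF_8, Charset.forName("iso-8859-1") };
        for (Charset cs : charsets) {
          byte[] raw = doc.getBytes(cs);                   // bytes handed to the indexer
          String base64 = Base64.encodeBase64String(raw);  // value stored in "binaryContent"
          String decoded = new String(Base64.decodeBase64(base64), cs);
          if (!doc.equals(decoded)) {
            throw new AssertionError("round trip failed for " + cs.name());
          }
        }
        System.out.println("round trip ok");
      }
    }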

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
new file mode 100644
index 0000000..14b246b
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.test.IntegrationTest;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+@Category(IntegrationTest.class)
+public class TestIndexingFilters {
+
+  /**
+   * Test behaviour when a defined filter does not exist.
+   * 
+   * @throws IndexingException
+   */
+  @Test
+  public void testNonExistingIndexingFilter() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("crawl-tests.xml");
+
+    String class1 = "NonExistingFilter";
+    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
+
+    IndexingFilters filters = new IndexingFilters(conf);
+    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
+        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
+        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+  }
+
+  /**
+   * Test behaviour when the NutchDocument is null.
+   */
+  @Test
+  public void testNutchDocumentNullIndexingFilter() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("crawl-tests.xml");
+
+    IndexingFilters filters = new IndexingFilters(conf);
+    NutchDocument doc = filters.filter(null, new ParseImpl("text",
+        new ParseData(new ParseStatus(), "title", new Outlink[0],
+            new Metadata())), new Text("http://www.example.com/"),
+        new CrawlDatum(), new Inlinks());
+
+    Assert.assertNull(doc);
+  }
+
+  /**
+   * Test that re-setting the indexing filter order does not take effect: the cached filter chain is reused.
+   * 
+   * @throws IndexingException
+   */
+  @Test
+  public void testFilterCacheIndexingFilter() throws IndexingException {
+    Configuration conf = NutchConfiguration.create();
+    conf.addResource("nutch-default.xml");
+    conf.addResource("crawl-tests.xml");
+
+    String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);
+
+    IndexingFilters filters1 = new IndexingFilters(conf);
+    NutchDocument fdoc1 = filters1.filter(new NutchDocument(), new ParseImpl(
+        "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
+            new Metadata())), new Text("http://www.example.com/"),
+        new CrawlDatum(), new Inlinks());
+
+    // add another index filter
+    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
+    // set content metadata
+    Metadata md = new Metadata();
+    md.add("example", "data");
+    // set content metadata property defined in MetadataIndexer
+    conf.set("index.content.md", "example");
+    // add MetadataIndexer filter
+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
+    IndexingFilters filters2 = new IndexingFilters(conf);
+    NutchDocument fdoc2 = filters2.filter(new NutchDocument(), new ParseImpl(
+        "text", new ParseData(new ParseStatus(), "title", new Outlink[0], md)),
+        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
+    Assert.assertEquals(fdoc1.getFieldNames().size(), fdoc2.getFieldNames()
+        .size());
+  }
+
+}
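
testFilterCacheIndexingFilter() above passes because the configured filter chain is
cached per Configuration: the second IndexingFilters still applies only
BasicIndexingFilter, so fdoc1 and fdoc2 end up with the same number of fields. For
reference, a hedged sketch of the setup pattern these tests rely on, lifted from the
code above; whether the listed plugins can actually be instantiated depends on
plugin.includes in the build:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.indexer.IndexingFilters;
    import org.apache.nutch.util.NutchConfiguration;

    public class FilterOrderSketch {
      public static void main(String[] args) {
        // List filter classes, in order, under IndexingFilters.INDEXINGFILTER_ORDER.
        Configuration conf = NutchConfiguration.create();
        conf.set(IndexingFilters.INDEXINGFILTER_ORDER,
            "org.apache.nutch.indexer.basic.BasicIndexingFilter "
                + "org.apache.nutch.indexer.metadata.MetadataIndexer");
        // content-metadata key that MetadataIndexer copies into the document
        conf.set("index.content.md", "example");
        IndexingFilters filters = new IndexingFilters(conf);
        System.out.println("filters configured: " + filters);
      }
    }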

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java b/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
new file mode 100644
index 0000000..f3a320d
--- /dev/null
+++ b/nutch-core/src/test/java/org/apache/nutch/metadata/TestMetadata.java
@@ -0,0 +1,281 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Properties;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}.
+ */
+public class TestMetadata {
+
+  private static final String CONTENTTYPE = "contenttype";
+
+  /**
+   * Test to ensure that only non-null values get written when the
+   * {@link Metadata} object is written using a Writable.
+   * 
+   * @since NUTCH-406
+   * 
+   */
+  @Test
+  public void testWriteNonNull() {
+    Metadata met = new Metadata();
+    met.add(CONTENTTYPE, null);
+    met.add(CONTENTTYPE, "text/bogus");
+    met.add(CONTENTTYPE, "text/bogus2");
+    met = writeRead(met);
+
+    Assert.assertNotNull(met);
+    Assert.assertEquals(met.size(), 1);
+
+    boolean hasBogus = false, hasBogus2 = false;
+
+    String[] values = met.getValues(CONTENTTYPE);
+    Assert.assertNotNull(values);
+    Assert.assertEquals(values.length, 2);
+
+    for (int i = 0; i < values.length; i++) {
+      if (values[i].equals("text/bogus")) {
+        hasBogus = true;
+      }
+
+      if (values[i].equals("text/bogus2")) {
+        hasBogus2 = true;
+      }
+    }
+
+    Assert.assertTrue(hasBogus && hasBogus2);
+  }
+
+  /** Test for the <code>add(String, String)</code> method. */
+  @Test
+  public void testAdd() {
+    String[] values = null;
+    Metadata meta = new Metadata();
+
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(0, values.length);
+
+    meta.add(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1", values[0]);
+
+    meta.add(CONTENTTYPE, "value2");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(2, values.length);
+    Assert.assertEquals("value1", values[0]);
+    Assert.assertEquals("value2", values[1]);
+
+    // NOTE : For now, the same value can be added many times.
+    // Should it be changed?
+    meta.add(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(3, values.length);
+    Assert.assertEquals("value1", values[0]);
+    Assert.assertEquals("value2", values[1]);
+    Assert.assertEquals("value1", values[2]);
+  }
+
+  /** Test for the <code>set(String, String)</code> method. */
+  @Test
+  public void testSet() {
+    String[] values = null;
+    Metadata meta = new Metadata();
+
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(0, values.length);
+
+    meta.set(CONTENTTYPE, "value1");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1", values[0]);
+
+    meta.set(CONTENTTYPE, "value2");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value2", values[0]);
+
+    meta.set(CONTENTTYPE, "new value 1");
+    meta.add("contenttype", "new value 2");
+    values = meta.getValues(CONTENTTYPE);
+    Assert.assertEquals(2, values.length);
+    Assert.assertEquals("new value 1", values[0]);
+    Assert.assertEquals("new value 2", values[1]);
+  }
+
+  /** Test for <code>setAll(Properties)</code> method. */
+  @Test
+  public void testSetProperties() {
+    String[] values = null;
+    Metadata meta = new Metadata();
+    Properties props = new Properties();
+
+    meta.setAll(props);
+    Assert.assertEquals(0, meta.size());
+
+    props.setProperty("name-one", "value1.1");
+    meta.setAll(props);
+    Assert.assertEquals(1, meta.size());
+    values = meta.getValues("name-one");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1.1", values[0]);
+
+    props.setProperty("name-two", "value2.1");
+    meta.setAll(props);
+    Assert.assertEquals(2, meta.size());
+    values = meta.getValues("name-one");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value1.1", values[0]);
+    values = meta.getValues("name-two");
+    Assert.assertEquals(1, values.length);
+    Assert.assertEquals("value2.1", values[0]);
+  }
+
+  /** Test for <code>get(String)</code> method. */
+  @Test
+  public void testGet() {
+    Metadata meta = new Metadata();
+    Assert.assertNull(meta.get("a-name"));
+    meta.add("a-name", "value-1");
+    Assert.assertEquals("value-1", meta.get("a-name"));
+    meta.add("a-name", "value-2");
+    Assert.assertEquals("value-1", meta.get("a-name"));
+  }
+
+  /** Test for <code>isMultiValued()</code> method. */
+  @Test
+  public void testIsMultiValued() {
+    Metadata meta = new Metadata();
+    Assert.assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value1");
+    Assert.assertFalse(meta.isMultiValued("key"));
+    meta.add("key", "value2");
+    Assert.assertTrue(meta.isMultiValued("key"));
+  }
+
+  /** Test for <code>names</code> method. */
+  @Test
+  public void testNames() {
+    String[] names = null;
+    Metadata meta = new Metadata();
+    names = meta.names();
+    Assert.assertEquals(0, names.length);
+
+    meta.add("name-one", "value");
+    names = meta.names();
+    Assert.assertEquals(1, names.length);
+    Assert.assertEquals("name-one", names[0]);
+    meta.add("name-two", "value");
+    names = meta.names();
+    Assert.assertEquals(2, names.length);
+  }
+
+  /** Test for <code>remove(String)</code> method. */
+  @Test
+  public void testRemove() {
+    Metadata meta = new Metadata();
+    meta.remove("name-one");
+    Assert.assertEquals(0, meta.size());
+    meta.add("name-one", "value-1.1");
+    meta.add("name-one", "value-1.2");
+    meta.add("name-two", "value-2.2");
+    Assert.assertEquals(2, meta.size());
+    Assert.assertNotNull(meta.get("name-one"));
+    Assert.assertNotNull(meta.get("name-two"));
+    meta.remove("name-one");
+    Assert.assertEquals(1, meta.size());
+    Assert.assertNull(meta.get("name-one"));
+    Assert.assertNotNull(meta.get("name-two"));
+    meta.remove("name-two");
+    Assert.assertEquals(0, meta.size());
+    Assert.assertNull(meta.get("name-one"));
+    Assert.assertNull(meta.get("name-two"));
+  }
+
+  /** Test for <code>equals(Object)</code> method. */
+  @Test
+  public void testObject() {
+    Metadata meta1 = new Metadata();
+    Metadata meta2 = new Metadata();
+    Assert.assertFalse(meta1.equals(null));
+    Assert.assertFalse(meta1.equals("String"));
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.1");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.1");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-one", "value-1.2");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-one", "value-1.2");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.1");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-two", "value-2.1");
+    Assert.assertTrue(meta1.equals(meta2));
+    meta1.add("name-two", "value-2.2");
+    Assert.assertFalse(meta1.equals(meta2));
+    meta2.add("name-two", "value-2.x");
+    Assert.assertFalse(meta1.equals(meta2));
+  }
+
+  /** Test for <code>Writable</code> implementation. */
+  @Test
+  public void testWritable() {
+    Metadata result = null;
+    Metadata meta = new Metadata();
+    result = writeRead(meta);
+    Assert.assertEquals(0, result.size());
+    meta.add("name-one", "value-1.1");
+    result = writeRead(meta);
+    Assert.assertEquals(1, result.size());
+    Assert.assertEquals(1, result.getValues("name-one").length);
+    Assert.assertEquals("value-1.1", result.get("name-one"));
+    meta.add("name-two", "value-2.1");
+    meta.add("name-two", "value-2.2");
+    result = writeRead(meta);
+    Assert.assertEquals(2, result.size());
+    Assert.assertEquals(1, result.getValues("name-one").length);
+    Assert.assertEquals("value-1.1", result.getValues("name-one")[0]);
+    Assert.assertEquals(2, result.getValues("name-two").length);
+    Assert.assertEquals("value-2.1", result.getValues("name-two")[0]);
+    Assert.assertEquals("value-2.2", result.getValues("name-two")[1]);
+  }
+
+  private Metadata writeRead(Metadata meta) {
+    Metadata read = new Metadata();
+    try {
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      meta.write(new DataOutputStream(out));
+      read.readFields(new DataInputStream(new ByteArrayInputStream(out
+          .toByteArray())));
+    } catch (IOException ioe) {
+      Assert.fail(ioe.toString());
+    }
+    return read;
+  }
+
+}
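
The writeRead() helper above is the standard Hadoop Writable round trip: serialize with
write(DataOutput) into an in-memory buffer, then rebuild a fresh instance with
readFields(DataInput). A minimal standalone sketch of the same pattern against the
org.apache.nutch.metadata.Metadata API exercised by these tests (the class name is
illustrative only):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    import org.apache.nutch.metadata.Metadata;

    public class MetadataRoundTripSketch {
      public static void main(String[] args) throws IOException {
        Metadata meta = new Metadata();
        meta.add("Content-Type", "text/html");

        // write(DataOutput): serialize into an in-memory buffer
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        meta.write(new DataOutputStream(out));

        // readFields(DataInput): rebuild an equal copy from the serialized bytes
        Metadata copy = new Metadata();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray())));

        System.out.println(copy.get("Content-Type")); // text/html
      }
    }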