Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:49:06 UTC

[50/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 5b3c687..7a70f9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,9 @@ build/
 runtime/
 logs/
 /bin/
+
+*.class
+target/
+nutch-core/target
+nutch-plugins/target
+nutch-plugins/*/target
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/bin/crawl
----------------------------------------------------------------------
diff --git a/bin/crawl b/bin/crawl
new file mode 100755
index 0000000..567d35e
--- /dev/null
+++ b/bin/crawl
@@ -0,0 +1,281 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
+#    -i|--index      Indexes crawl results into a configured indexer
+#    -w|--wait       NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
+#                    are scheduled for fetching. Suffix can be: s for second,
+#                    m for minute, h for hour and d for day. If no suffix is
+#                    specified, seconds are used by default.
+#    -D              A Java property to pass to Nutch calls
+#    Seed Dir        Directory in which to look for a seeds file
+#    Crawl Dir       Directory where the crawl/link/segments dirs are saved
+#    Num Rounds      The number of rounds to run this crawl for
+#
+#
+# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
+# INDEXING FOR EACH SEGMENT
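+#
+# Example invocation (illustrative only; the directories and the Solr URL
+# below are hypothetical, not part of this commit):
+#
+#   bin/crawl -i -D solr.server.url=http://localhost:8983/solr/nutch -w 30s urls/ crawl/ 2
+#
+# This runs two generate-fetch-parse-update rounds over the seeds in urls/,
+# indexes each segment, and waits 30 seconds before retrying whenever no
+# URLs are due for fetching.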
+
+INDEXFLAG=false
+JAVA_PROPERTIES=""
+WAIT=-1 # don't wait if there are no URLs to fetch
+
+function __to_seconds() {
+  NUMBER=$(echo $1 | tr -dc '0-9')
+  MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')
+
+  case $MODIFIER in
+      m|M)
+        SECONDS=`expr $NUMBER \* 60`
+        ;;
+      h|H)
+        SECONDS=`expr $NUMBER \* 3600`
+        ;;
+      d|D)
+        SECONDS=`expr $NUMBER \* 86400`
+        ;;
+      s|S|*)
+        SECONDS=$NUMBER
+        ;;
+  esac
+
+  echo $SECONDS
+}
+
+while [[ $# > 0 ]]
+do
+    case $1 in
+        -i|--index)
+            INDEXFLAG=true
+            shift
+            ;;
+        -D)
+            JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
+            shift 2
+            ;;
+        -w|--wait)
+            WAIT="${2}"
+            shift 2
+            ;;
+        *)
+            break
+            ;;
+    esac
+done
+
+if [[ $# != 3 ]]; then
+    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
+    echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
+    echo -e "\t-D\t\tA Java property to pass to Nutch calls"
+    echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
+    echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
+    echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
+    echo -e "\t\t\tspecified second is used by default."
+    echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
+    echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
+    echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+    exit 1
+fi
+
+SEEDDIR="$1"
+CRAWL_PATH="$2"
+LIMIT="$3"
+
+# convert wait time to seconds for compatibility reasons
+if [ "$WAIT" != "-1" ]; then
+  WAIT=$( __to_seconds "$WAIT" )
+  echo "Time to wait (--wait) = $WAIT sec."
+fi
+
+#############################################
+# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
+#############################################
+
+# set the number of slaves nodes
+numSlaves=1
+
+# and the total number of available tasks
+# sets Hadoop parameter "mapreduce.job.reduces"
+numTasks=`expr $numSlaves \* 2`
+
+# number of URLs to fetch in one iteration (50,000 per slave node)
+sizeFetchlist=`expr $numSlaves \* 50000`
+
+# time limit for fetching (in minutes)
+timeLimitFetch=180
+
+# num threads for fetching
+numThreads=50
+
+#############################################
+
+bin="`dirname "$0"`"
+bin="`cd "$bin"; pwd`"
+
+# determine the mode based on the presence of a job file
+mode=local
+if [ -f "${bin}"/../*nutch*.job ]; then
+    mode=distributed
+fi
+
+# note that some of the options listed here could be set in the
+# corresponding hadoop site xml param file
+commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
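+# For example (illustrative snippet; the exact file location depends on your
+# Hadoop installation), mapreduce.job.reduces could equally be set once in
+# mapred-site.xml instead of being passed on every call:
+#   <property><name>mapreduce.job.reduces</name><value>2</value></property>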
+
+ # check that hadoop can be found on the path
+if [ $mode = "distributed" ]; then
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+    exit -1;
+ fi
+fi
+
+
+function __bin_nutch {
+    # run $bin/nutch, exit if exit value indicates error
+
+    echo "$bin/nutch $@" ;# echo command and arguments
+    "$bin/nutch" "$@"
+
+    RETCODE=$?
+    if [ $RETCODE -ne 0 ]
+    then
+        echo "Error running:"
+        echo "  $bin/nutch $@"
+        echo "Failed with exit value $RETCODE."
+        exit $RETCODE
+    fi
+}
+
+
+
+# initial injection
+echo "Injecting seed URLs"
+__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+
+# main loop : rounds of generate - fetch - parse - update
+for ((a=1; ; a++))
+do
+  if [ -e ".STOP" ]
+  then
+   echo "STOP file found - escaping loop"
+   break
+  fi
+
+  if [ $LIMIT -ne -1 ]; then
+    if [ $a -gt $LIMIT ]; then
+      echo `date` ": Finished loop with $LIMIT iterations"
+      break
+    fi
+    echo `date` ": Iteration $a of $LIMIT"
+  else
+    echo `date` ": Iteration $a"
+  fi
+
+  echo "Generating a new segment"
+  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+  echo "$bin/nutch generate ${generate_args[@]}"
+  $bin/nutch generate "${generate_args[@]}"
+  RETCODE=$?
+  if [ $RETCODE -eq 0 ]; then
+      : # ok: no error
+  elif [ $RETCODE -eq 1 ]; then
+    echo "Generate returned 1 (no new segments created)"
+
+    if [ "$WAIT" -ne -1 ]; then
+      echo "Waiting for $WAIT sec. ..."
+      sleep $WAIT
+      continue
+    else
+      echo "Escaping loop: no more URLs to fetch now"
+      break
+    fi
+  else
+    echo "Error running:"
+    echo "  $bin/nutch generate ${generate_args[@]}"
+    echo "Failed with exit value $RETCODE."
+    exit $RETCODE
+  fi
+
+  # capture the name of the segment
+  # call hadoop in distributed mode
+  # or use ls
+
+  if [ $mode = "local" ]; then
+   SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
+  else
+   SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments |  sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
+  fi
+
+  echo "Operating on segment : $SEGMENT"
+
+  # fetching the segment
+  echo "Fetching : $SEGMENT"
+  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+
+  # parsing the segment
+  echo "Parsing : $SEGMENT"
+  # enable the skipping of records during parsing so that a dodgy document
+  # does not fail the whole task
+  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
+  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
+
+  # updatedb with this segment
+  echo "CrawlDB update"
+  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT
+
+# note that the link inversion - indexing routine can be done within the main loop
+# on a per segment basis
+  echo "Link inversion"
+  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+  echo "Dedup on crawldb"
+  __bin_nutch dedup "$CRAWL_PATH"/crawldb
+
+  if $INDEXFLAG; then
+      echo "Indexing $SEGMENT to index"
+      __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+      echo "Cleaning up index if possible"
+      __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
+  else
+      echo "Skipping indexing ..."
+  fi
+
+  #######################################################
+  # The following commands fall into WebGraph territory
+  # and should be uncommented based on your requirements
+  #######################################################
+  #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
+  #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
+
+  #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
+  #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
+
+  #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
+  #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
+
+  #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and  WebGraph within $CRAWL_PATH"
+  #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
+
+  #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
+  #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
+
+done
+
+exit 0

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/bin/nutch
----------------------------------------------------------------------
diff --git a/bin/nutch b/bin/nutch
new file mode 100755
index 0000000..1649069
--- /dev/null
+++ b/bin/nutch
@@ -0,0 +1,324 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# The Nutch command script
+#
+# Environment Variables
+#
+#   NUTCH_JAVA_HOME The java implementation to use.  Overrides JAVA_HOME.
+#
+#   NUTCH_HEAPSIZE  The maximum amount of heap to use, in MB. 
+#                   Default is 1000.
+#
+#   NUTCH_OPTS      Extra Java runtime options.
+#                   Multiple options must be separated by white space.
+#
+#   NUTCH_LOG_DIR   Log directory (default: $NUTCH_HOME/logs)
+#
+#   NUTCH_LOGFILE   Log file (default: hadoop.log)
+#
+#   NUTCH_CONF_DIR  Path(s) to configuration files (default: $NUTCH_HOME/conf).
+#                   Multiple paths must be separated by a colon ':'.
+#
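+# Example (illustrative values, not part of this commit):
+#
+#   export NUTCH_HEAPSIZE=4000                       # 4 GB heap
+#   export NUTCH_OPTS="-Dhadoop.tmp.dir=/tmp/nutch"  # extra JVM options
+#   bin/nutch readdb crawl/crawldb -stats
+#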
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+  ls=`ls -ld "$THIS"`
+  link=`expr "$ls" : '.*-> \(.*\)$'`
+  if expr "$link" : '.*/.*' > /dev/null; then
+    THIS="$link"
+  else
+    THIS=`dirname "$THIS"`/"$link"
+  fi
+done
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+  echo "nutch 1.12"
+  echo "Usage: nutch COMMAND"
+  echo "where COMMAND is one of:"
+  echo "  readdb            read / dump crawl db"
+  echo "  mergedb           merge crawldb-s, with optional filtering"
+  echo "  readlinkdb        read / dump link db"
+  echo "  inject            inject new urls into the database"
+  echo "  generate          generate new segments to fetch from crawl db"
+  echo "  freegen           generate new segments to fetch from text files"
+  echo "  fetch             fetch a segment's pages"
+  echo "  parse             parse a segment's pages"
+  echo "  readseg           read / dump segment data"
+  echo "  mergesegs         merge several segments, with optional filtering and slicing"
+  echo "  updatedb          update crawl db from segments after fetching"
+  echo "  invertlinks       create a linkdb from parsed segments"
+  echo "  mergelinkdb       merge linkdb-s, with optional filtering"
+  echo "  index             run the plugin-based indexer on parsed segments and linkdb"
+  echo "  dedup             deduplicate entries in the crawldb and give them a special status"
+  echo "  dump              exports crawled data from segments into files"
+  echo "  commoncrawldump   exports crawled data from segments into common crawl data format encoded as CBOR"
+  echo "  solrindex         run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
+  echo "  solrdedup         remove duplicates from solr - DEPRECATED use the dedup command instead"
+  echo "  solrclean         remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
+  echo "  clean             remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
+  echo "  parsechecker      check the parser for a given url"
+  echo "  indexchecker      check the indexing filters for a given url"
+  echo "  filterchecker     check url filters for a given url"
+  echo "  normalizerchecker check url normalizers for a given url"
+  echo "  domainstats       calculate domain statistics from crawldb"
+  echo "  protocolstats     calculate protocol status code stats from crawldb"
+  echo "  crawlcomplete     calculate crawl completion stats from crawldb"
+  echo "  webgraph          generate a web graph from existing segments"
+  echo "  linkrank          run a link analysis program on the generated web graph"
+  echo "  scoreupdater      updates the crawldb with linkrank scores"
+  echo "  nodedumper        dumps the web graph's node scores"
+  echo "  plugin            load a plugin and run one of its classes main()"
+  echo "  junit             runs the given JUnit test"
+  echo "  startserver       runs the Nutch Server on localhost:8081"
+  echo "  webapp            run a local Nutch Web Application on locahost:8080"
+  echo "  warc              exports crawled data from segments at the WARC format"
+  echo "  updatehostdb      update the host db with records from the crawl db"
+  echo "  readhostdb        read / dump host db"
+  echo " or"
+  echo "  CLASSNAME         run the class named CLASSNAME"
+  echo "Most commands print help when invoked w/o parameters."
+  exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+# some directories
+THIS_DIR="`dirname "$THIS"`"
+NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"
+
+# some Java parameters
+if [ "$NUTCH_JAVA_HOME" != "" ]; then
+  #echo "run java in $NUTCH_JAVA_HOME"
+  JAVA_HOME="$NUTCH_JAVA_HOME"
+fi
+  
+if [ "$JAVA_HOME" = "" ]; then
+  echo "Error: JAVA_HOME is not set."
+  exit 1
+fi
+
+local=true
+
+# NUTCH_JOB 
+if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then
+  local=false
+  for f in "$NUTCH_HOME"/*nutch*.job; do
+    NUTCH_JOB="$f"
+  done
+  # cygwin path translation
+  if $cygwin; then
+	NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"
+  fi
+fi
+
+JAVA="$JAVA_HOME/bin/java"
+JAVA_HEAP_MAX=-Xmx1000m 
+
+# check envvars which might override default args
+if [ "$NUTCH_HEAPSIZE" != "" ]; then
+  #echo "run with heapsize $NUTCH_HEAPSIZE"
+  JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
+  #echo $JAVA_HEAP_MAX
+fi
+
+# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
+CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
+CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# add libs to CLASSPATH
+if $local; then
+  for f in "$NUTCH_HOME"/lib/*.jar; do
+   CLASSPATH="${CLASSPATH}:$f";
+  done
+  # local runtime
+  # add plugins to classpath
+  if [ -d "$NUTCH_HOME/plugins" ]; then
+     CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
+  fi
+fi
+
+# cygwin path translation
+if $cygwin; then
+  CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
+fi
+
+# setup 'java.library.path' for native-hadoop code if necessary
+# used only in local mode 
+JAVA_LIBRARY_PATH=''
+if [ -d "${NUTCH_HOME}/lib/native" ]; then
+
+  JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`
+
+  if [ -d "${NUTCH_HOME}/lib/native" ]; then
+    if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+      JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+    else
+      JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+    fi
+  fi
+fi
+
+if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
+  JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`"
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+# default log directory & file
+if [ "$NUTCH_LOG_DIR" = "" ]; then
+  NUTCH_LOG_DIR="$NUTCH_HOME/logs"
+fi
+if [ "$NUTCH_LOGFILE" = "" ]; then
+  NUTCH_LOGFILE='hadoop.log'
+fi
+
+#Fix log path under cygwin
+if $cygwin; then
+  NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"
+fi
+
+NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR")
+NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE")
+
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+  NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH")
+fi
+
+# figure out which class to run
+if [ "$COMMAND" = "crawl" ] ; then
+  echo "Command $COMMAND is deprecated, please use bin/crawl instead"
+  exit -1
+elif [ "$COMMAND" = "inject" ] ; then
+  CLASS=org.apache.nutch.crawl.Injector
+elif [ "$COMMAND" = "generate" ] ; then
+  CLASS=org.apache.nutch.crawl.Generator
+elif [ "$COMMAND" = "freegen" ] ; then
+  CLASS=org.apache.nutch.tools.FreeGenerator
+elif [ "$COMMAND" = "fetch" ] ; then
+  CLASS=org.apache.nutch.fetcher.Fetcher
+elif [ "$COMMAND" = "parse" ] ; then
+  CLASS=org.apache.nutch.parse.ParseSegment
+elif [ "$COMMAND" = "readdb" ] ; then
+  CLASS=org.apache.nutch.crawl.CrawlDbReader
+elif [ "$COMMAND" = "mergedb" ] ; then
+  CLASS=org.apache.nutch.crawl.CrawlDbMerger
+elif [ "$COMMAND" = "readlinkdb" ] ; then
+  CLASS=org.apache.nutch.crawl.LinkDbReader
+elif [ "$COMMAND" = "readseg" ] ; then
+  CLASS=org.apache.nutch.segment.SegmentReader
+elif [ "$COMMAND" = "mergesegs" ] ; then
+  CLASS=org.apache.nutch.segment.SegmentMerger
+elif [ "$COMMAND" = "updatedb" ] ; then
+  CLASS=org.apache.nutch.crawl.CrawlDb
+elif [ "$COMMAND" = "invertlinks" ] ; then
+  CLASS=org.apache.nutch.crawl.LinkDb
+elif [ "$COMMAND" = "mergelinkdb" ] ; then
+  CLASS=org.apache.nutch.crawl.LinkDbMerger
+elif [ "$COMMAND" = "dump" ] ; then
+  CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+  CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
+elif [ "$COMMAND" = "solrindex" ] ; then
+  CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
+  shift
+elif [ "$COMMAND" = "index" ] ; then
+  CLASS=org.apache.nutch.indexer.IndexingJob
+elif [ "$COMMAND" = "solrdedup" ] ; then
+  echo "Command $COMMAND is deprecated, please use dedup instead"
+  exit -1
+elif [ "$COMMAND" = "dedup" ] ; then
+  CLASS=org.apache.nutch.crawl.DeduplicationJob
+elif [ "$COMMAND" = "solrclean" ] ; then
+  CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
+  shift; shift
+elif [ "$COMMAND" = "clean" ] ; then
+  CLASS=org.apache.nutch.indexer.CleaningJob
+elif [ "$COMMAND" = "parsechecker" ] ; then
+  CLASS=org.apache.nutch.parse.ParserChecker
+elif [ "$COMMAND" = "indexchecker" ] ; then
+  CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
+elif [ "$COMMAND" = "filterchecker" ] ; then
+  CLASS=org.apache.nutch.net.URLFilterChecker
+elif [ "$COMMAND" = "normalizerchecker" ] ; then
+  CLASS=org.apache.nutch.net.URLNormalizerChecker
+elif [ "$COMMAND" = "domainstats" ] ; then 
+  CLASS=org.apache.nutch.util.domain.DomainStatistics
+elif [ "$COMMAND" = "protocolstats" ] ; then
+   CLASS=org.apache.nutch.util.ProtocolStatusStatistics
+elif [ "$COMMAND" = "crawlcomplete" ] ; then
+  CLASS=org.apache.nutch.util.CrawlCompletionStats
+elif [ "$COMMAND" = "webgraph" ] ; then
+  CLASS=org.apache.nutch.scoring.webgraph.WebGraph
+elif [ "$COMMAND" = "linkrank" ] ; then
+  CLASS=org.apache.nutch.scoring.webgraph.LinkRank
+elif [ "$COMMAND" = "scoreupdater" ] ; then
+  CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater
+elif [ "$COMMAND" = "nodedumper" ] ; then
+  CLASS=org.apache.nutch.scoring.webgraph.NodeDumper
+elif [ "$COMMAND" = "plugin" ] ; then
+  CLASS=org.apache.nutch.plugin.PluginRepository
+elif [ "$COMMAND" = "junit" ] ; then
+  CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
+  if $local; then
+    for f in "$NUTCH_HOME"/test/lib/*.jar; do
+      CLASSPATH="${CLASSPATH}:$f";
+    done
+  fi
+  CLASS=org.junit.runner.JUnitCore
+elif [ "$COMMAND" = "startserver" ] ; then
+  CLASS=org.apache.nutch.service.NutchServer
+elif [ "$COMMAND" = "webapp" ] ; then
+  CLASS=org.apache.nutch.webui.NutchUiServer
+elif [ "$COMMAND" = "warc" ] ; then
+  CLASS=org.apache.nutch.tools.warc.WARCExporter
+elif [ "$COMMAND" = "updatehostdb" ] ; then
+  CLASS=org.apache.nutch.hostdb.UpdateHostDb
+elif [ "$COMMAND" = "readhostdb" ] ; then
+  CLASS=org.apache.nutch.hostdb.ReadHostDb
+else
+  CLASS=$COMMAND
+fi
+
+# distributed mode
+EXEC_CALL=(hadoop jar "$NUTCH_JOB")
+
+if $local; then
+ EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH")
+else
+ # check that hadoop can be found on the path
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+    exit -1;
+ fi
+fi
+
+# run it
+exec "${EXEC_CALL[@]}" $CLASS "$@"
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml
new file mode 100644
index 0000000..62e2e58
--- /dev/null
+++ b/nutch-core/pom.xml
@@ -0,0 +1,522 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-parent</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>nutch-core</artifactId>
+    <packaging>jar</packaging>
+
+    <name>Apache Nutch</name>
+    <description>Nutch is an open source web-search software.
+        It builds on Hadoop, Tika and Solr, adding web-specifics,
+        such as a crawler, a link-graph database etc.
+    </description>
+    <url>http://nutch.apache.org</url>
+    <licenses>
+    <license>
+      <name>The Apache Software License, Version 2.0</name>
+      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+      <distribution>repo</distribution>
+    </license>
+  </licenses>
+
+    <scm>
+    <developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/nutch.git</developerConnection>
+    <connection>scm:git:http://git-wip-us.apache.org/repos/asf/nutch.git</connection>
+    <url>https://git-wip-us.apache.org/repos/asf/nutch.git</url>
+  </scm>
+
+  <pluginRepositories>
+    <pluginRepository>
+      <id>miredot</id>
+      <name>MireDot Releases</name>
+      <url>http://nexus.qmino.com/content/repositories/miredot</url>
+    </pluginRepository>
+  </pluginRepositories>
+
+  <developers>
+    <developer>
+      <id>mattmann</id>
+      <name>Chris A. Mattmann</name>
+      <email>mattmann@apache.org</email>
+    </developer>
+    <developer>
+      <id>jnioche</id>
+      <name>Julien Nioche</name>
+      <email>jnioche@apache.org</email>
+    </developer>
+    <developer>
+      <id>lewismc</id>
+      <name>Lewis John McGibbney</name>
+      <email>lewismc@apache.org</email>
+    </developer>
+    <developer>
+      <id>markus</id>
+      <name>Markus Jelsma</name>
+      <email>markus@apache.org</email>
+    </developer>
+    <developer>
+      <id>fenglu</id>
+      <name>Feng Lu</name>
+      <email>fenglu@apache.org</email>
+    </developer>
+    <developer>
+      <id>kiranch</id>
+      <name>Kiran Chitturi</name>
+      <email>kiranch@apache.org</email>
+    </developer>
+    <developer>
+      <id>tejasp</id>
+      <name>Tejas Patil</name>
+      <email>tejasp@apache.org</email>
+    </developer>
+    <developer>
+      <id>talat</id>
+      <name>Talat Uyarer</name>
+      <email>talat@apache.org</email>
+    </developer>
+    <developer>
+      <id>snagel</id>
+      <name>Sebastian Nagel</name>
+      <email>snagel@apache.org</email>
+    </developer>
+    <developer>
+      <id>thammegowda</id>
+      <name>Thamme Gowda</name>
+      <email>thammegowda@apache.org</email>
+    </developer>
+  </developers>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <slf4j.version>1.7.12</slf4j.version>
+        <junit.version>4.12</junit.version>
+        <dir.root>${project.parent.basedir}</dir.root>
+        <libs.dir>${dir.local}${file.separator}lib</libs.dir>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>${slf4j.version}</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-log4j12</artifactId>
+            <version>${slf4j.version}</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>commons-lang</groupId>
+            <artifactId>commons-lang</artifactId>
+            <version>2.6</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>commons-collections</groupId>
+            <artifactId>commons-collections</artifactId>
+            <version>3.2.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>commons-httpclient</groupId>
+            <artifactId>commons-httpclient</artifactId>
+            <version>3.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+            <version>1.10</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-compress</artifactId>
+            <version>1.9</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-jexl</artifactId>
+            <version>2.1.1</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.tdunning</groupId>
+            <artifactId>t-digest</artifactId>
+            <version>3.1</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+            <version>2.4.0</version>
+            <optional>true</optional>
+            <exclusions>
+                <exclusion>
+                    <groupId>hsqldb</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>net.sf.kosmosfs</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>net.java.dev.jets3t</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.eclipse.jdt</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>ant</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-hdfs</artifactId>
+            <version>2.4.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-mapreduce-client-core</artifactId>
+            <version>2.4.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+            <version>2.4.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>1.12</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu4j</artifactId>
+            <version>55.1</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>xerces</groupId>
+            <artifactId>xercesImpl</artifactId>
+            <version>2.11.0</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>xerces</groupId>
+            <artifactId>xmlParserAPIs</artifactId>
+            <version>2.6.2</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>oro</groupId>
+            <artifactId>oro</artifactId>
+            <version>2.0.8</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.google.guava</groupId>
+            <artifactId>guava</artifactId>
+            <version>16.0.1</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.github.crawler-commons</groupId>
+            <artifactId>crawler-commons</artifactId>
+            <version>0.6</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.martinkl.warc</groupId>
+            <artifactId>warc-hadoop</artifactId>
+            <version>0.1.0</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-frontend-jaxws</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-frontend-jaxrs</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-transports-http</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-transports-http-jetty</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.cxf</groupId>
+            <artifactId>cxf-rt-rs-client</artifactId>
+            <version>3.0.4</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>2.5.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.dataformat</groupId>
+            <artifactId>jackson-dataformat-cbor</artifactId>
+            <version>2.5.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.fasterxml.jackson.jaxrs</groupId>
+            <artifactId>jackson-jaxrs-json-provider</artifactId>
+            <version>2.5.1</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>4.10.2</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.netpreserve.commons</groupId>
+            <artifactId>webarchive-commons</artifactId>
+            <version>1.1.5</version>
+            <optional>true</optional>
+            <exclusions>
+                <exclusion>
+                    <groupId>*</groupId>
+                    <artifactId>hadoop-core</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>com.google.guava</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>junit</groupId>
+                    <artifactId>*</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.11</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.mrunit</groupId>
+            <artifactId>mrunit</artifactId>
+            <version>1.1.0</version>
+            <classifier>hadoop2</classifier>
+            <optional>true</optional>
+            <exclusions>
+                <exclusion>
+                    <groupId>log4j</groupId>
+                    <artifactId>log4j</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.mortbay.jetty</groupId>
+            <artifactId>jetty-client</artifactId>
+            <version>6.1.22</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.mortbay.jetty</groupId>
+            <artifactId>jetty</artifactId>
+            <version>6.1.22</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.mortbay.jetty</groupId>
+            <artifactId>jetty-util</artifactId>
+            <version>6.1.22</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-collections4</artifactId>
+            <version>4.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework</groupId>
+            <artifactId>spring-core</artifactId>
+            <version>4.0.4.RELEASE</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework</groupId>
+            <artifactId>spring-context</artifactId>
+            <version>4.0.4.RELEASE</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework</groupId>
+            <artifactId>spring-web</artifactId>
+            <version>4.0.4.RELEASE</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.sun.jersey</groupId>
+            <artifactId>jersey-client</artifactId>
+            <version>1.8</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.j256.ormlite</groupId>
+            <artifactId>ormlite-jdbc</artifactId>
+            <version>4.48</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>com.h2database</groupId>
+            <artifactId>h2</artifactId>
+            <version>1.4.180</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.eclipse.persistence</groupId>
+            <artifactId>javax.persistence</artifactId>
+            <version>2.0.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.wicket</groupId>
+            <artifactId>wicket-core</artifactId>
+            <version>6.16.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.wicket</groupId>
+            <artifactId>wicket-spring</artifactId>
+            <version>6.16.0</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>de.agilecoders.wicket</groupId>
+            <artifactId>wicket-bootstrap-core</artifactId>
+            <version>0.9.2</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>de.agilecoders.wicket</groupId>
+            <artifactId>wicket-bootstrap-extensions</artifactId>
+            <version>0.9.2</version>
+            <optional>true</optional>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <resources>
+            <resource>
+                <directory>${project.parent.basedir}${file.separator}conf</directory>
+            </resource>
+        </resources>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <version>2.6</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-resources-plugin</artifactId>
+                <version>3.0.1</version>
+                <executions>
+                    <execution>
+                        <id>copy-resources</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>copy-resources</goal>
+                        </goals>
+                        <configuration>
+                            <outputDirectory>${libs.dir}</outputDirectory>
+                            <resources>
+                                <resource>
+                                    <directory>${project.build.directory}</directory>
+                                    <include>${build.finalName}.jar</include>
+                                </resource>
+                                <resource>
+                                    <directory>${project.basedir}</directory>
+                                    <include>plugin.xml</include>
+                                </resource>
+                            </resources>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.19.1</version>
+                <configuration>
+                    <excludedGroups>org.apache.nutch.test.IntegrationTest</excludedGroups>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-failsafe-plugin</artifactId>
+                <version>2.19.1</version>
+                <configuration>
+                    <systemPropertyVariables>
+                        <plugin.folders>../runtime/local/plugins</plugin.folders>
+                    </systemPropertyVariables>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
new file mode 100755
index 0000000..c259419
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
@@ -0,0 +1,227 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * This class provides common methods for implementations of
+ * <code>FetchSchedule</code>.
+ * 
+ * @author Andrzej Bialecki
+ */
+public abstract class AbstractFetchSchedule extends Configured implements
+    FetchSchedule {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(AbstractFetchSchedule.class);
+
+  protected int defaultInterval;
+  protected int maxInterval;
+
+  public AbstractFetchSchedule() {
+    super(null);
+  }
+
+  public AbstractFetchSchedule(Configuration conf) {
+    super(conf);
+  }
+
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null)
+      return;
+    defaultInterval = conf.getInt("db.fetch.interval.default", 0);
+    maxInterval = conf.getInt("db.fetch.interval.max", 0);
+    LOG.info("defaultInterval=" + defaultInterval);
+    LOG.info("maxInterval=" + maxInterval);
+  }
+
+  /**
+   * Initialize fetch schedule related data. Implementations should at least set
+   * the <code>fetchTime</code> and <code>fetchInterval</code>. The default
+   * implementation sets the <code>fetchTime</code> to now, using the default
+   * <code>fetchInterval</code>.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          datum instance to be initialized (modified in place).
+   */
+  public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
+    datum.setFetchTime(System.currentTimeMillis());
+    datum.setFetchInterval(defaultInterval);
+    datum.setRetriesSinceFetch(0);
+    return datum;
+  }
+
+  /**
+   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
+   * successfully fetched page. NOTE: this implementation resets the retry
+   * counter - extending classes should call super.setFetchSchedule() to
+   * preserve this behavior.
+   */
+  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime,
+      long modifiedTime, int state) {
+    datum.setRetriesSinceFetch(0);
+    return datum;
+  }
+
+  /**
+   * This method specifies how to schedule refetching of pages marked as GONE.
+   * Default implementation increases fetchInterval by 50% but the value may
+   * never exceed <code>maxInterval</code>.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          datum instance to be adjusted.
+   * 
+   * @return adjusted page information, including all original information.
+   *         NOTE: this may be a different instance than @see CrawlDatum, but
+   *         implementations should make sure that it contains at least all
+   *         information from @see CrawlDatum.
+   */
+  public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime) {
+    // no page is truly GONE ... just increase the interval by 50%
+    // and try much later.
+    if ((datum.getFetchInterval() * 1.5f) < maxInterval)
+      datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
+    else
+      datum.setFetchInterval(maxInterval * 0.9f);
+    datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000);
+    return datum;
+  }
+
+  /**
+   * This method adjusts the fetch schedule if fetching needs to be re-tried due
+   * to transient errors. The default implementation sets the next fetch time 1
+   * day in the future and increases the retry counter.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          page information.
+   * 
+   * @param prevFetchTime
+   *          previous fetch time.
+   * 
+   * @param prevModifiedTime
+   *          previous modified time.
+   * 
+   * @param fetchTime
+   *          current fetch time.
+   * 
+   * @return adjusted page information, including all original information.
+   *         NOTE: this may be a different instance than @see CrawlDatum, but
+   *         implementations should make sure that it contains at least all
+   *         information from @see CrawlDatum.
+   */
+  public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime) {
+    datum.setFetchTime(fetchTime + (long) SECONDS_PER_DAY * 1000);
+    datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
+    return datum;
+  }
+
+  /**
+   * This method returns the last fetch time of the CrawlDatum.
+   * 
+   * @return the date as a long.
+   */
+  public long calculateLastFetchTime(CrawlDatum datum) {
+    return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
+  }
+
+  /**
+   * This method provides information whether the page is suitable for selection
+   * in the current fetchlist. NOTE: a true return value does not guarantee that
+   * the page will be fetched, it just allows it to be included in the further
+   * selection process based on scores. The default implementation checks
+   * <code>fetchTime</code>, if it is higher than the <code>curTime</code> it
+   * returns false, and true otherwise. It will also check that fetchTime is not
+   * too remote (more than <code>maxInterval</code>, in which case it lowers the
+   * interval and returns true.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          datum instance.
+   * 
+   * @param curTime
+   *          reference time (usually set to the time when the fetchlist
+   *          generation process was started).
+   * 
+   * @return true, if the page should be considered for inclusion in the current
+   *         fetchlist, otherwise false.
+   */
+  public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
+    // pages are never truly GONE - we have to check them from time to time.
+    // pages with too long fetchInterval are adjusted so that they fit within
+    // maximum fetchInterval (segment retention period).
+    if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
+      if (datum.getFetchInterval() > maxInterval) {
+        datum.setFetchInterval(maxInterval * 0.9f);
+      }
+      datum.setFetchTime(curTime);
+    }
+    if (datum.getFetchTime() > curTime) {
+      return false; // not time yet
+    }
+    return true;
+  }
+
+  /**
+   * This method resets fetchTime, fetchInterval, modifiedTime,
+   * retriesSinceFetch and page signature, so that it forces refetching.
+   * 
+   * @param url
+   *          URL of the page.
+   * 
+   * @param datum
+   *          datum instance.
+   * 
+   * @param asap
+   *          if true, force refetch as soon as possible - this sets the
+   *          fetchTime to now. If false, force refetch whenever the next fetch
+   *          time is set.
+   */
+  public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
+    // reduce fetchInterval so that it fits within the max value
+    if (datum.getFetchInterval() > maxInterval)
+      datum.setFetchInterval(maxInterval * 0.9f);
+    datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    datum.setRetriesSinceFetch(0);
+    datum.setSignature(null);
+    datum.setModifiedTime(0L);
+    if (asap)
+      datum.setFetchTime(System.currentTimeMillis());
+    return datum;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
new file mode 100755
index 0000000..08cad34
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class implements an adaptive re-fetch algorithm. This works as follows:
+ * <ul>
+ * <li>for pages that has changed since the last fetchTime, decrease their
+ * fetchInterval by a factor of DEC_FACTOR (default value is 0.2f).</li>
+ * <li>for pages that haven't changed since the last fetchTime, increase their
+ * fetchInterval by a factor of INC_FACTOR (default value is 0.2f).<br>
+ * If SYNC_DELTA property is true, then:
+ * <ul>
+ * <li>calculate a <code>delta = fetchTime - modifiedTime</code></li>
+ * <li>try to synchronize with the time of change, by shifting the next
+ * fetchTime by a fraction of the difference between the last modification time
+ * and the last fetch time. I.e. the next fetch time will be set to
+ * <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li>
+ * <li>if the adjusted fetch interval is bigger than the delta, then
+ * <code>fetchInterval = delta</code>.</li>
+ * </ul>
+ * </li>
+ * <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL
+ * (default is 1 minute).</li>
+ * <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL
+ * (default is 365 days).</li>
+ * </ul>
+ * <p>
+ * NOTE: values of DEC_RATE and INC_RATE higher than 0.4f may destabilize
+ * the algorithm, so that the fetch interval either increases or decreases
+ * infinitely, with little relevance to the page changes. Please use
+ * {@link #main(String[])} method to test the values before applying them in a
+ * production system.
+ * </p>
+ * 
+ * @author Andrzej Bialecki
+ */
+public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
+
+  // Logger
+  public static final Logger LOG = LoggerFactory
+      .getLogger(AdaptiveFetchSchedule.class);
+
+  protected float INC_RATE;
+
+  protected float DEC_RATE;
+
+  private float MAX_INTERVAL;
+
+  private float MIN_INTERVAL;
+
+  private boolean SYNC_DELTA;
+
+  private double SYNC_DELTA_RATE;
+
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null)
+      return;
+    INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+    DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+    MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", (float) 60.0);
+    MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval",
+        (float) SECONDS_PER_DAY * 365); // 1 year
+    SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
+    SYNC_DELTA_RATE = conf.getFloat(
+        "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
+  }
+
+  @Override
+  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+      long prevFetchTime, long prevModifiedTime, long fetchTime,
+      long modifiedTime, int state) {
+    super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+        fetchTime, modifiedTime, state);
+
+    float interval = datum.getFetchInterval();
+    long refTime = fetchTime;
+
+    // https://issues.apache.org/jira/browse/NUTCH-1430
+    interval = (interval == 0) ? defaultInterval : interval;
+
+    if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
+      // Is fetch interval preset in CrawlDatum MD? Then use preset interval
+      FloatWritable customIntervalWritable = (FloatWritable) (datum
+          .getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
+      interval = customIntervalWritable.get();
+    } else {
+      if (modifiedTime <= 0)
+        modifiedTime = fetchTime;
+      switch (state) {
+      case FetchSchedule.STATUS_MODIFIED:
+        interval *= (1.0f - DEC_RATE);
+        break;
+      case FetchSchedule.STATUS_NOTMODIFIED:
+        interval *= (1.0f + INC_RATE);
+        break;
+      case FetchSchedule.STATUS_UNKNOWN:
+        break;
+      }
+      if (SYNC_DELTA) {
+        // try to synchronize with the time of change
+        long delta = (fetchTime - modifiedTime) / 1000L;
+        if (delta > interval)
+          interval = delta;
+        refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
+      }
+      if (interval < MIN_INTERVAL) {
+        interval = MIN_INTERVAL;
+      } else if (interval > MAX_INTERVAL) {
+        interval = MAX_INTERVAL;
+      }
+    }
+
+    datum.setFetchInterval(interval);
+    datum.setFetchTime(refTime + Math.round(interval * 1000.0));
+    datum.setModifiedTime(modifiedTime);
+    return datum;
+  }
+
+  public static void main(String[] args) throws Exception {
+    FetchSchedule fs = new AdaptiveFetchSchedule();
+    fs.setConf(NutchConfiguration.create());
+    // we start the time at 0, for simplicity
+    long curTime = 0;
+    long delta = 1000L * 3600L * 24L; // 1 day
+    // we trigger the update of the page every 30 days
+    long update = 1000L * 3600L * 24L * 30L; // 30 days
+    boolean changed = true;
+    long lastModified = 0;
+    int miss = 0;
+    int totalMiss = 0;
+    int maxMiss = 0;
+    int fetchCnt = 0;
+    int changeCnt = 0;
+    // initial fetchInterval is 30 days
+    CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
+    p.setFetchTime(0);
+    LOG.info(p.toString());
+    // let's move the timeline a couple of deltas
+    for (int i = 0; i < 10000; i++) {
+      if (lastModified + update < curTime) {
+        // System.out.println("i=" + i + ", lastModified=" + lastModified +
+        // ", update=" + update + ", curTime=" + curTime);
+        changed = true;
+        changeCnt++;
+        lastModified = curTime;
+      }
+      LOG.info(i + ". " + changed + "\twill fetch at "
+          + (p.getFetchTime() / delta) + "\tinterval "
+          + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+          + miss);
+      if (p.getFetchTime() <= curTime) {
+        fetchCnt++;
+        fs.setFetchSchedule(new Text("http://www.example.com"), p, p
+            .getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+            changed ? FetchSchedule.STATUS_MODIFIED
+                : FetchSchedule.STATUS_NOTMODIFIED);
+        LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+            + (p.getFetchTime() / delta) + "\tinterval "
+            + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
+        if (!changed)
+          miss++;
+        if (miss > maxMiss)
+          maxMiss = miss;
+        changed = false;
+        totalMiss += miss;
+        miss = 0;
+      }
+      if (changed)
+        miss++;
+      curTime += delta;
+    }
+    LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+        + " times.");
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
new file mode 100644
index 0000000..7fe3e1e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -0,0 +1,572 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.util.*;
+import java.util.Map.Entry;
+
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.jexl2.MapContext;
+
+import org.apache.hadoop.io.*;
+import org.apache.nutch.util.*;
+
+/* The crawl state of a url. */
+public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
+  public static final String GENERATE_DIR_NAME = "crawl_generate";
+  public static final String FETCH_DIR_NAME = "crawl_fetch";
+  public static final String PARSE_DIR_NAME = "crawl_parse";
+
+  private final static byte CUR_VERSION = 7;
+
+  /** Compatibility values for on-the-fly conversion from versions < 5. */
+  private static final byte OLD_STATUS_SIGNATURE = 0;
+  private static final byte OLD_STATUS_DB_UNFETCHED = 1;
+  private static final byte OLD_STATUS_DB_FETCHED = 2;
+  private static final byte OLD_STATUS_DB_GONE = 3;
+  private static final byte OLD_STATUS_LINKED = 4;
+  private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
+  private static final byte OLD_STATUS_FETCH_RETRY = 6;
+  private static final byte OLD_STATUS_FETCH_GONE = 7;
+
+  private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>();
+
+  /** Page was not fetched yet. */
+  public static final byte STATUS_DB_UNFETCHED = 0x01;
+  /** Page was successfully fetched. */
+  public static final byte STATUS_DB_FETCHED = 0x02;
+  /** Page no longer exists. */
+  public static final byte STATUS_DB_GONE = 0x03;
+  /** Page temporarily redirects to other page. */
+  public static final byte STATUS_DB_REDIR_TEMP = 0x04;
+  /** Page permanently redirects to other page. */
+  public static final byte STATUS_DB_REDIR_PERM = 0x05;
+  /** Page was successfully fetched and found not modified. */
+  public static final byte STATUS_DB_NOTMODIFIED = 0x06;
+  public static final byte STATUS_DB_DUPLICATE = 0x07;
+
+  /** Maximum value of DB-related status. */
+  public static final byte STATUS_DB_MAX = 0x1f;
+
+  /** Fetching was successful. */
+  public static final byte STATUS_FETCH_SUCCESS = 0x21;
+  /** Fetching unsuccessful, needs to be retried (transient errors). */
+  public static final byte STATUS_FETCH_RETRY = 0x22;
+  /** Fetching temporarily redirected to other page. */
+  public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
+  /** Fetching permanently redirected to other page. */
+  public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
+  /** Fetching unsuccessful - page is gone. */
+  public static final byte STATUS_FETCH_GONE = 0x25;
+  /** Fetching successful - page is not modified. */
+  public static final byte STATUS_FETCH_NOTMODIFIED = 0x26;
+
+  /** Maximum value of fetch-related status. */
+  public static final byte STATUS_FETCH_MAX = 0x3f;
+
+  /** Page signature. */
+  public static final byte STATUS_SIGNATURE = 0x41;
+  /** Page was newly injected. */
+  public static final byte STATUS_INJECTED = 0x42;
+  /** Page discovered through a link. */
+  public static final byte STATUS_LINKED = 0x43;
+  /** Page got metadata from a parser */
+  public static final byte STATUS_PARSE_META = 0x44;
+
+  public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>();
+  static {
+    statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
+    statNames.put(STATUS_DB_FETCHED, "db_fetched");
+    statNames.put(STATUS_DB_GONE, "db_gone");
+    statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp");
+    statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
+    statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
+    statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
+    statNames.put(STATUS_SIGNATURE, "signature");
+    statNames.put(STATUS_INJECTED, "injected");
+    statNames.put(STATUS_LINKED, "linked");
+    statNames.put(STATUS_FETCH_SUCCESS, "fetch_success");
+    statNames.put(STATUS_FETCH_RETRY, "fetch_retry");
+    statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp");
+    statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
+    statNames.put(STATUS_FETCH_GONE, "fetch_gone");
+    statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
+    statNames.put(STATUS_PARSE_META, "parse_metadata");
+
+    oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
+    oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
+    oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
+    oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE);
+    oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS);
+    oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY);
+    oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED);
+    oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE);
+  }
+
+  private byte status;
+  private long fetchTime = System.currentTimeMillis();
+  private byte retries;
+  private int fetchInterval;
+  private float score = 0.0f;
+  private byte[] signature = null;
+  private long modifiedTime;
+  private org.apache.hadoop.io.MapWritable metaData;
+
+  public static boolean hasDbStatus(CrawlDatum datum) {
+    if (datum.status <= STATUS_DB_MAX)
+      return true;
+    return false;
+  }
+
+  public static boolean hasFetchStatus(CrawlDatum datum) {
+    if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX)
+      return true;
+    return false;
+  }
+
+  public CrawlDatum() {
+  }
+
+  public CrawlDatum(int status, int fetchInterval) {
+    this();
+    this.status = (byte) status;
+    this.fetchInterval = fetchInterval;
+  }
+
+  public CrawlDatum(int status, int fetchInterval, float score) {
+    this(status, fetchInterval);
+    this.score = score;
+  }
+
+  //
+  // accessor methods
+  //
+
+  public byte getStatus() {
+    return status;
+  }
+
+  public static String getStatusName(byte value) {
+    String res = statNames.get(value);
+    if (res == null)
+      res = "unknown";
+    return res;
+  }
+
+  public void setStatus(int status) {
+    this.status = (byte) status;
+  }
+
+  /**
+   * Returns either the time of the last fetch, or the next fetch time,
+   * depending on whether Fetcher or CrawlDbReducer set the time.
+   */
+  public long getFetchTime() {
+    return fetchTime;
+  }
+
+  /**
+   * Sets either the time of the last fetch or the next fetch time, depending on
+   * whether Fetcher or CrawlDbReducer set the time.
+   */
+  public void setFetchTime(long fetchTime) {
+    this.fetchTime = fetchTime;
+  }
+
+  public long getModifiedTime() {
+    return modifiedTime;
+  }
+
+  public void setModifiedTime(long modifiedTime) {
+    this.modifiedTime = modifiedTime;
+  }
+
+  public byte getRetriesSinceFetch() {
+    return retries;
+  }
+
+  public void setRetriesSinceFetch(int retries) {
+    this.retries = (byte) retries;
+  }
+
+  public int getFetchInterval() {
+    return fetchInterval;
+  }
+
+  public void setFetchInterval(int fetchInterval) {
+    this.fetchInterval = fetchInterval;
+  }
+
+  public void setFetchInterval(float fetchInterval) {
+    this.fetchInterval = Math.round(fetchInterval);
+  }
+
+  public float getScore() {
+    return score;
+  }
+
+  public void setScore(float score) {
+    this.score = score;
+  }
+
+  public byte[] getSignature() {
+    return signature;
+  }
+
+  public void setSignature(byte[] signature) {
+    if (signature != null && signature.length > 256)
+      throw new RuntimeException("Max signature length (256) exceeded: "
+          + signature.length);
+    this.signature = signature;
+  }
+
+  public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
+    this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
+  }
+
+  /**
+   * Add all metadata from other CrawlDatum to this CrawlDatum.
+   * 
+   * @param other
+   *          CrawlDatum
+   */
+  public void putAllMetaData(CrawlDatum other) {
+    for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
+      getMetaData().put(e.getKey(), e.getValue());
+    }
+  }
+
+  /**
+   * Returns the MapWritable if it was set or read in
+   * {@link #readFields(DataInput)}; returns an empty map if the CrawlDatum
+   * was freshly created (the map is lazily instantiated).
+   */
+  public org.apache.hadoop.io.MapWritable getMetaData() {
+    if (this.metaData == null)
+      this.metaData = new org.apache.hadoop.io.MapWritable();
+    return this.metaData;
+  }
+
+  //
+  // writable methods
+  //
+
+  public static CrawlDatum read(DataInput in) throws IOException {
+    CrawlDatum result = new CrawlDatum();
+    result.readFields(in);
+    return result;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    byte version = in.readByte(); // read version
+    if (version > CUR_VERSION) // check version
+      throw new VersionMismatchException(CUR_VERSION, version);
+
+    status = in.readByte();
+    fetchTime = in.readLong();
+    retries = in.readByte();
+    if (version > 5) {
+      fetchInterval = in.readInt();
+    } else
+      fetchInterval = Math.round(in.readFloat());
+    score = in.readFloat();
+    if (version > 2) {
+      modifiedTime = in.readLong();
+      int cnt = in.readByte();
+      if (cnt > 0) {
+        signature = new byte[cnt];
+        in.readFully(signature);
+      } else
+        signature = null;
+    }
+
+    if (version > 3) {
+      boolean hasMetadata = false;
+      if (version < 7) {
+        org.apache.hadoop.io.MapWritable oldMetaData = new org.apache.hadoop.io.MapWritable();
+        if (in.readBoolean()) {
+          hasMetadata = true;
+          metaData = new org.apache.hadoop.io.MapWritable();
+          oldMetaData.readFields(in);
+        }
+        for (Writable key : oldMetaData.keySet()) {
+          metaData.put(key, oldMetaData.get(key));
+        }
+      } else {
+        if (in.readBoolean()) {
+          hasMetadata = true;
+          metaData = new org.apache.hadoop.io.MapWritable();
+          metaData.readFields(in);
+        }
+      }
+      if (hasMetadata == false)
+        metaData = null;
+    }
+    // translate status codes
+    if (version < 5) {
+      if (oldToNew.containsKey(status))
+        status = oldToNew.get(status);
+      else
+        status = STATUS_DB_UNFETCHED;
+
+    }
+  }
+
+  /** The byte offset within a serialized CrawlDatum at which the score is stored. */
+  private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
+  private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;
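+  // These offsets mirror the record layout produced by write(): version (1
+  // byte), status (1), fetchTime (8), retries (1), fetchInterval (4), then
+  // score (4), modifiedTime (8), signature length (1) and signature bytes.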
+
+  public void write(DataOutput out) throws IOException {
+    out.writeByte(CUR_VERSION); // store current version
+    out.writeByte(status);
+    out.writeLong(fetchTime);
+    out.writeByte(retries);
+    out.writeInt(fetchInterval);
+    out.writeFloat(score);
+    out.writeLong(modifiedTime);
+    if (signature == null) {
+      out.writeByte(0);
+    } else {
+      out.writeByte(signature.length);
+      out.write(signature);
+    }
+    if (metaData != null && metaData.size() > 0) {
+      out.writeBoolean(true);
+      metaData.write(out);
+    } else {
+      out.writeBoolean(false);
+    }
+  }
+
+  /** Copy the contents of another instance into this instance. */
+  public void set(CrawlDatum that) {
+    this.status = that.status;
+    this.fetchTime = that.fetchTime;
+    this.retries = that.retries;
+    this.fetchInterval = that.fetchInterval;
+    this.score = that.score;
+    this.modifiedTime = that.modifiedTime;
+    this.signature = that.signature;
+    if (that.metaData != null) {
+      // make a deep copy
+      this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData);
+    } else {
+      this.metaData = null;
+    }
+  }
+
+  //
+  // compare methods
+  //
+
+  /** Sort by decreasing score. */
+  public int compareTo(CrawlDatum that) {
+    if (that.score != this.score)
+      return (that.score - this.score) > 0 ? 1 : -1;
+    if (that.status != this.status)
+      return this.status - that.status;
+    if (that.fetchTime != this.fetchTime)
+      return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1;
+    if (that.retries != this.retries)
+      return that.retries - this.retries;
+    if (that.fetchInterval != this.fetchInterval)
+      return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
+    if (that.modifiedTime != this.modifiedTime)
+      return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
+    return SignatureComparator._compare(this, that);
+  }
+
+  /** A Comparator optimized for CrawlDatum. */
+  public static class Comparator extends WritableComparator {
+    public Comparator() {
+      super(CrawlDatum.class);
+    }
+
+    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+      float score1 = readFloat(b1, s1 + SCORE_OFFSET);
+      float score2 = readFloat(b2, s2 + SCORE_OFFSET);
+      if (score2 != score1) {
+        return (score2 - score1) > 0 ? 1 : -1;
+      }
+      int status1 = b1[s1 + 1];
+      int status2 = b2[s2 + 1];
+      if (status2 != status1)
+        return status1 - status2;
+      long fetchTime1 = readLong(b1, s1 + 1 + 1);
+      long fetchTime2 = readLong(b2, s2 + 1 + 1);
+      if (fetchTime2 != fetchTime1)
+        return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;
+      int retries1 = b1[s1 + 1 + 1 + 8];
+      int retries2 = b2[s2 + 1 + 1 + 8];
+      if (retries2 != retries1)
+        return retries2 - retries1;
+      int fetchInterval1 = readInt(b1, s1 + 1 + 1 + 8 + 1);
+      int fetchInterval2 = readInt(b2, s2 + 1 + 1 + 8 + 1);
+      if (fetchInterval2 != fetchInterval1)
+        return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
+      long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
+      long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
+      if (modifiedTime2 != modifiedTime1)
+        return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
+      int sigl1 = b1[s1 + SIG_OFFSET];
+      int sigl2 = b2[s2 + SIG_OFFSET];
+      return SignatureComparator._compare(b1, s1 + SIG_OFFSET, sigl1, b2,
+          s2 + SIG_OFFSET, sigl2);
+    }
+  }
+
+  static { // register this comparator
+    WritableComparator.define(CrawlDatum.class, new Comparator());
+  }
+
+  //
+  // basic methods
+  //
+
+  public String toString() {
+    StringBuilder buf = new StringBuilder();
+    buf.append("Version: " + CUR_VERSION + "\n");
+    buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus())
+        + ")\n");
+    buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
+    buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
+    buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
+    buf.append("Retry interval: " + getFetchInterval() + " seconds ("
+        + (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
+    buf.append("Score: " + getScore() + "\n");
+    buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
+    buf.append("Metadata: \n ");
+    if (metaData != null) {
+      for (Entry<Writable, Writable> e : metaData.entrySet()) {
+        buf.append("\t");
+        buf.append(e.getKey());
+        buf.append("=");
+        buf.append(e.getValue());
+        buf.append("\n");
+      }
+    }
+    return buf.toString();
+  }
+
+  private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
+    if (metaData == null || metaData.size() == 0) {
+      return otherMetaData == null || otherMetaData.size() == 0;
+    }
+    if (otherMetaData == null) {
+      // we already know that the current object is not null or empty
+      return false;
+    }
+    HashSet<Entry<Writable, Writable>> set1 = new HashSet<Entry<Writable, Writable>>(
+        metaData.entrySet());
+    HashSet<Entry<Writable, Writable>> set2 = new HashSet<Entry<Writable, Writable>>(
+        otherMetaData.entrySet());
+    return set1.equals(set2);
+  }
+
+  public boolean equals(Object o) {
+    if (!(o instanceof CrawlDatum))
+      return false;
+    CrawlDatum other = (CrawlDatum) o;
+    boolean res = (this.status == other.status)
+        && (this.fetchTime == other.fetchTime)
+        && (this.modifiedTime == other.modifiedTime)
+        && (this.retries == other.retries)
+        && (this.fetchInterval == other.fetchInterval)
+        && (SignatureComparator._compare(this.signature, other.signature) == 0)
+        && (this.score == other.score);
+    if (!res)
+      return res;
+    return metadataEquals(other.metaData);
+  }
+
+  public int hashCode() {
+    int res = 0;
+    if (signature != null) {
+      for (int i = 0; i + 3 < signature.length; i += 4) {
+        res ^= ((signature[i] & 0xff) << 24) | ((signature[i + 1] & 0xff) << 16)
+            | ((signature[i + 2] & 0xff) << 8) | (signature[i + 3] & 0xff);
+      }
+    }
+    if (metaData != null) {
+      res ^= metaData.entrySet().hashCode();
+    }
+    return res ^ status ^ ((int) fetchTime) ^ ((int) modifiedTime) ^ retries
+        ^ fetchInterval ^ Float.floatToIntBits(score);
+  }
+
+  public Object clone() {
+    try {
+      return super.clone();
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
+  }
+  
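+  /**
+   * Evaluates a JEXL expression against this datum's fields and metadata.
+   * Returns true only if the expression evaluates to Boolean.TRUE; evaluation
+   * errors are swallowed and count as a non-match. As an illustrative example
+   * (not taken from the original source), the expression
+   * <code>status == 'db_fetched' && retries == 0</code> would match
+   * successfully fetched records that never needed a retry.
+   */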
+  public boolean evaluate(Expression expr) {
+    if (expr != null) {
+      // Create a context and add data
+      JexlContext jcontext = new MapContext();
+      
+      // https://issues.apache.org/jira/browse/NUTCH-2229
+      jcontext.set("status", getStatusName(getStatus()));
+      jcontext.set("fetchTime", (long)(getFetchTime()));
+      jcontext.set("modifiedTime", (long)(getModifiedTime()));
+      jcontext.set("retries", getRetriesSinceFetch());
+      jcontext.set("interval", new Integer(getFetchInterval()));
+      jcontext.set("score", getScore());
+      jcontext.set("signature", StringUtil.toHexString(getSignature()));
+            
+      // Set metadata variables
+      for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
+        Object value = entry.getValue();
+        
+        if (value instanceof FloatWritable) {
+          FloatWritable fvalue = (FloatWritable)value;
+          Text tkey = (Text)entry.getKey();
+          jcontext.set(tkey.toString(), fvalue.get());
+        }
+        
+        if (value instanceof IntWritable) {
+          IntWritable ivalue = (IntWritable)value;
+          Text tkey = (Text)entry.getKey();
+          jcontext.set(tkey.toString(), ivalue.get());
+        }
+        
+        if (value instanceof Text) {
+          Text tvalue = (Text)value;
+          Text tkey = (Text)entry.getKey();     
+          jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
+        }
+      }
+                  
+      try {
+        if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
+          return true;
+        }
+      } catch (Exception e) {
+        // ignore evaluation errors; the expression simply does not match
+      }
+    }
+
+    return false;
+  }
+}
\ No newline at end of file