Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:49:06 UTC
[50/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 5b3c687..7a70f9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,9 @@ build/
runtime/
logs/
/bin/
+
+*.class
+target/
+nutch-core/target
+nutch-plugins/target
+nutch-plugins/*/target
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/bin/crawl
----------------------------------------------------------------------
diff --git a/bin/crawl b/bin/crawl
new file mode 100755
index 0000000..567d35e
--- /dev/null
+++ b/bin/crawl
@@ -0,0 +1,281 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait NUMBER[SUFFIX]] <Seed Dir> <Crawl Dir> <Num Rounds>
+# -i|--index Indexes crawl results into a configured indexer
+# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
+# are scheduled for fetching. Suffix can be: s for second,
+# m for minute, h for hour and d for day. If no suffix is
+# specified, seconds are used by default.
+# -D A Java property to pass to Nutch calls
+# Seed Dir Directory in which to look for a seeds file
+# Crawl Dir Directory where the crawl/link/segments dirs are saved
+# Num Rounds The number of rounds to run this crawl for (use -1 for unlimited)
+#
+#
+# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
+# INDEXING FOR EACH SEGMENT
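+#
+# Example invocation (illustrative values only; adjust paths and the
+# indexer property to your setup):
+#   bin/crawl -i -D solr.server.url=http://localhost:8983/solr/nutch -w 30s urls/ crawl/ 2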
+
+INDEXFLAG=false
+JAVA_PROPERTIES=""
+WAIT=-1 # don't wait if there are no URLs to fetch
+
+function __to_seconds() {
+ NUMBER=$(echo $1 | tr -dc '0-9')
+ MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')
+
+ # use SECS rather than the auto-incrementing bash builtin SECONDS
+ case $MODIFIER in
+ m|M)
+ SECS=`expr $NUMBER \* 60`
+ ;;
+ h|H)
+ SECS=`expr $NUMBER \* 3600`
+ ;;
+ d|D)
+ SECS=`expr $NUMBER \* 86400`
+ ;;
+ s|S|*)
+ SECS=$NUMBER
+ ;;
+ esac
+
+ echo $SECS
+}
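+# Usage example (for illustration): "__to_seconds 10m" prints 600 and
+# "__to_seconds 2h" prints 7200.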
+
+while [[ $# -gt 0 ]]
+do
+ case $1 in
+ -i|--index)
+ INDEXFLAG=true
+ shift
+ ;;
+ -D)
+ JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
+ shift 2
+ ;;
+ -w|--wait)
+ WAIT="${2}"
+ shift 2
+ ;;
+ *)
+ break
+ ;;
+ esac
+done
+
+if [[ $# != 3 ]]; then
+ echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
+ echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
+ echo -e "\t-D\t\tA Java property to pass to Nutch calls"
+ echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
+ echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
+ echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
+ echo -e "\t\t\tspecified second is used by default."
+ echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
+ echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
+ echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+ exit 1
+fi
+
+SEEDDIR="$1"
+CRAWL_PATH="$2"
+LIMIT="$3"
+
+# convert wait time to seconds for compatibility reasons
+if [ "$WAIT" != "-1" ]; then
+ WAIT=$( __to_seconds "$WAIT" )
+ echo "Time to wait (--wait) = $WAIT sec."
+fi
+
+#############################################
+# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
+#############################################
+
+# set the number of slave nodes
+numSlaves=1
+
+# and the total number of available tasks
+# sets Hadoop parameter "mapreduce.job.reduces"
+numTasks=`expr $numSlaves \* 2`
+
+# number of URLs to fetch in one iteration
+# (numSlaves * 50000, i.e. 50K per slave node)
+sizeFetchlist=`expr $numSlaves \* 50000`
+
+# time limit for fetching, in minutes (passed as fetcher.timelimit.mins)
+timeLimitFetch=180
+
+# num threads for fetching
+numThreads=50
+
+#############################################
+
+bin="`dirname "$0"`"
+bin="`cd "$bin"; pwd`"
+
+# determine the mode based on the presence of a job file
+mode=local
+if [ -f "${bin}"/../*nutch*.job ]; then
+ mode=distributed
+fi
+
+# note that some of the options listed here could be set in the
+# corresponding hadoop site xml param file
+commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
+
+# check that hadoop can be found on the path
+if [ $mode = "distributed" ]; then
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+ echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+ exit -1;
+ fi
+fi
+
+
+function __bin_nutch {
+ # run $bin/nutch, exit if exit value indicates error
+
+ echo "$bin/nutch $@" ;# echo command and arguments
+ "$bin/nutch" "$@"
+
+ RETCODE=$?
+ if [ $RETCODE -ne 0 ]
+ then
+ echo "Error running:"
+ echo " $bin/nutch $@"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
+}
+
+
+
+# initial injection
+echo "Injecting seed URLs"
+__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+
+# main loop : rounds of generate - fetch - parse - update
+for ((a=1; ; a++))
+do
+ if [ -e ".STOP" ]
+ then
+ echo "STOP file found - escaping loop"
+ break
+ fi
+
+ if [ $LIMIT -ne -1 ]; then
+ if [ $a -gt $LIMIT ]; then
+ echo `date` ": Finished loop with $LIMIT iterations"
+ break
+ fi
+ echo `date` ": Iteration $a of $LIMIT"
+ else
+ echo `date` ": Iteration $a"
+ fi
+
+ echo "Generating a new segment"
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+ echo "$bin/nutch generate ${generate_args[@]}"
+ $bin/nutch generate "${generate_args[@]}"
+ RETCODE=$?
+ if [ $RETCODE -eq 0 ]; then
+ : # ok: no error
+ elif [ $RETCODE -eq 1 ]; then
+ echo "Generate returned 1 (no new segments created)"
+
+ if [ "$WAIT" -ne -1 ]; then
+ echo "Waiting for $WAIT sec. ..."
+ sleep $WAIT
+ continue
+ else
+ echo "Escaping loop: no more URLs to fetch now"
+ break
+ fi
+ else
+ echo "Error running:"
+ echo " $bin/nutch generate ${generate_args[@]}"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
+
+ # capture the name of the segment
+ # call hadoop in distributed mode
+ # or use ls
+
+ if [ $mode = "local" ]; then
+ SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
+ else
+ SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
+ fi
+
+ echo "Operating on segment : $SEGMENT"
+
+ # fetching the segment
+ echo "Fetching : $SEGMENT"
+ __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+
+ # parsing the segment
+ echo "Parsing : $SEGMENT"
+ # enable the skipping of records during parsing so that a dodgy document
+ # does not cause the whole task to fail
+ skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
+ __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
+
+ # updatedb with this segment
+ echo "CrawlDB update"
+ __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
+
+# note that the link inversion and indexing routines below operate
+# within the main loop on a per-segment basis
+ echo "Link inversion"
+ __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+ echo "Dedup on crawldb"
+ __bin_nutch dedup "$CRAWL_PATH"/crawldb
+
+ if $INDEXFLAG; then
+ echo "Indexing $SEGMENT to index"
+ __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+ echo "Cleaning up index if possible"
+ __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
+ else
+ echo "Skipping indexing ..."
+ fi
+
+ #######################################################
+ # The following commands fall into WebGraph territory
+ # and should be uncommented based on your requirements
+ #######################################################
+ #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
+ #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
+ #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
+ #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
+ #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
+ #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
+
+done
+
+exit 0
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/bin/nutch
----------------------------------------------------------------------
diff --git a/bin/nutch b/bin/nutch
new file mode 100755
index 0000000..1649069
--- /dev/null
+++ b/bin/nutch
@@ -0,0 +1,324 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# The Nutch command script
+#
+# Environment Variables
+#
+# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+#
+# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB.
+# Default is 1000.
+#
+# NUTCH_OPTS Extra Java runtime options.
+# Multiple options must be separated by white space.
+#
+# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs)
+#
+# NUTCH_LOGFILE Log file (default: hadoop.log)
+#
+# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf).
+# Multiple paths must be separated by a colon ':'.
+#
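+# Example (illustrative): dump CrawlDb statistics with a 4 GB heap:
+#   NUTCH_HEAPSIZE=4000 bin/nutch readdb crawl/crawldb -stats
+#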
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+ ls=`ls -ld "$THIS"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ THIS="$link"
+ else
+ THIS=`dirname "$THIS"`/"$link"
+ fi
+done
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+ echo "nutch 1.12"
+ echo "Usage: nutch COMMAND"
+ echo "where COMMAND is one of:"
+ echo " readdb read / dump crawl db"
+ echo " mergedb merge crawldb-s, with optional filtering"
+ echo " readlinkdb read / dump link db"
+ echo " inject inject new urls into the database"
+ echo " generate generate new segments to fetch from crawl db"
+ echo " freegen generate new segments to fetch from text files"
+ echo " fetch fetch a segment's pages"
+ echo " parse parse a segment's pages"
+ echo " readseg read / dump segment data"
+ echo " mergesegs merge several segments, with optional filtering and slicing"
+ echo " updatedb update crawl db from segments after fetching"
+ echo " invertlinks create a linkdb from parsed segments"
+ echo " mergelinkdb merge linkdb-s, with optional filtering"
+ echo " index run the plugin-based indexer on parsed segments and linkdb"
+ echo " dedup deduplicate entries in the crawldb and give them a special status"
+ echo " dump exports crawled data from segments into files"
+ echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
+ echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
+ echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead"
+ echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
+ echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
+ echo " parsechecker check the parser for a given url"
+ echo " indexchecker check the indexing filters for a given url"
+ echo " filterchecker check url filters for a given url"
+ echo " normalizerchecker check url normalizers for a given url"
+ echo " domainstats calculate domain statistics from crawldb"
+ echo " protocolstats calculate protocol status code stats from crawldb"
+ echo " crawlcomplete calculate crawl completion stats from crawldb"
+ echo " webgraph generate a web graph from existing segments"
+ echo " linkrank run a link analysis program on the generated web graph"
+ echo " scoreupdater updates the crawldb with linkrank scores"
+ echo " nodedumper dumps the web graph's node scores"
+ echo " plugin load a plugin and run one of its classes main()"
+ echo " junit runs the given JUnit test"
+ echo " startserver runs the Nutch Server on localhost:8081"
+ echo " webapp run a local Nutch Web Application on locahost:8080"
+ echo " warc exports crawled data from segments at the WARC format"
+ echo " updatehostdb update the host db with records from the crawl db"
+ echo " readhostdb read / dump host db"
+ echo " or"
+ echo " CLASSNAME run the class named CLASSNAME"
+ echo "Most commands print help when invoked w/o parameters."
+ exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+# some directories
+THIS_DIR="`dirname "$THIS"`"
+NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"
+
+# some Java parameters
+if [ "$NUTCH_JAVA_HOME" != "" ]; then
+ #echo "run java in $NUTCH_JAVA_HOME"
+ JAVA_HOME="$NUTCH_JAVA_HOME"
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+ echo "Error: JAVA_HOME is not set."
+ exit 1
+fi
+
+local=true
+
+# NUTCH_JOB
+if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then
+ local=false
+ for f in "$NUTCH_HOME"/*nutch*.job; do
+ NUTCH_JOB="$f"
+ done
+ # cygwin path translation
+ if $cygwin; then
+ NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"
+ fi
+fi
+
+JAVA="$JAVA_HOME/bin/java"
+JAVA_HEAP_MAX=-Xmx1000m
+
+# check envvars which might override default args
+if [ "$NUTCH_HEAPSIZE" != "" ]; then
+ #echo "run with heapsize $NUTCH_HEAPSIZE"
+ JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
+ #echo $JAVA_HEAP_MAX
+fi
+
+# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
+CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
+CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# add libs to CLASSPATH
+if $local; then
+ for f in "$NUTCH_HOME"/lib/*.jar; do
+ CLASSPATH="${CLASSPATH}:$f";
+ done
+ # local runtime
+ # add plugins to classpath
+ if [ -d "$NUTCH_HOME/plugins" ]; then
+ CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
+ fi
+fi
+
+# cygwin path translation
+if $cygwin; then
+ CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
+fi
+
+# setup 'java.library.path' for native-hadoop code if necessary
+# used only in local mode
+JAVA_LIBRARY_PATH=''
+if [ -d "${NUTCH_HOME}/lib/native" ]; then
+
+ JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`
+
+ if [ -d "${NUTCH_HOME}/lib/native" ]; then
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+ else
+ JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+ fi
+ fi
+fi
+
+if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
+ JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`"
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+# default log directory & file
+if [ "$NUTCH_LOG_DIR" = "" ]; then
+ NUTCH_LOG_DIR="$NUTCH_HOME/logs"
+fi
+if [ "$NUTCH_LOGFILE" = "" ]; then
+ NUTCH_LOGFILE='hadoop.log'
+fi
+
+#Fix log path under cygwin
+if $cygwin; then
+ NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"
+fi
+
+NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR")
+NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE")
+
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH")
+fi
+
+# figure out which class to run
+if [ "$COMMAND" = "crawl" ] ; then
+ echo "Command $COMMAND is deprecated, please use bin/crawl instead"
+ exit -1
+elif [ "$COMMAND" = "inject" ] ; then
+ CLASS=org.apache.nutch.crawl.Injector
+elif [ "$COMMAND" = "generate" ] ; then
+ CLASS=org.apache.nutch.crawl.Generator
+elif [ "$COMMAND" = "freegen" ] ; then
+ CLASS=org.apache.nutch.tools.FreeGenerator
+elif [ "$COMMAND" = "fetch" ] ; then
+ CLASS=org.apache.nutch.fetcher.Fetcher
+elif [ "$COMMAND" = "parse" ] ; then
+ CLASS=org.apache.nutch.parse.ParseSegment
+elif [ "$COMMAND" = "readdb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDbReader
+elif [ "$COMMAND" = "mergedb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDbMerger
+elif [ "$COMMAND" = "readlinkdb" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDbReader
+elif [ "$COMMAND" = "readseg" ] ; then
+ CLASS=org.apache.nutch.segment.SegmentReader
+elif [ "$COMMAND" = "mergesegs" ] ; then
+ CLASS=org.apache.nutch.segment.SegmentMerger
+elif [ "$COMMAND" = "updatedb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDb
+elif [ "$COMMAND" = "invertlinks" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDb
+elif [ "$COMMAND" = "mergelinkdb" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDbMerger
+elif [ "$COMMAND" = "dump" ] ; then
+ CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+ CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
+elif [ "$COMMAND" = "solrindex" ] ; then
+ CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
+ shift
+elif [ "$COMMAND" = "index" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexingJob
+elif [ "$COMMAND" = "solrdedup" ] ; then
+ echo "Command $COMMAND is deprecated, please use dedup instead"
+ exit -1
+elif [ "$COMMAND" = "dedup" ] ; then
+ CLASS=org.apache.nutch.crawl.DeduplicationJob
+elif [ "$COMMAND" = "solrclean" ] ; then
+ CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
+ shift; shift
+elif [ "$COMMAND" = "clean" ] ; then
+ CLASS=org.apache.nutch.indexer.CleaningJob
+elif [ "$COMMAND" = "parsechecker" ] ; then
+ CLASS=org.apache.nutch.parse.ParserChecker
+elif [ "$COMMAND" = "indexchecker" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
+elif [ "$COMMAND" = "filterchecker" ] ; then
+ CLASS=org.apache.nutch.net.URLFilterChecker
+elif [ "$COMMAND" = "normalizerchecker" ] ; then
+ CLASS=org.apache.nutch.net.URLNormalizerChecker
+elif [ "$COMMAND" = "domainstats" ] ; then
+ CLASS=org.apache.nutch.util.domain.DomainStatistics
+elif [ "$COMMAND" = "protocolstats" ] ; then
+ CLASS=org.apache.nutch.util.ProtocolStatusStatistics
+elif [ "$COMMAND" = "crawlcomplete" ] ; then
+ CLASS=org.apache.nutch.util.CrawlCompletionStats
+elif [ "$COMMAND" = "webgraph" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.WebGraph
+elif [ "$COMMAND" = "linkrank" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.LinkRank
+elif [ "$COMMAND" = "scoreupdater" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater
+elif [ "$COMMAND" = "nodedumper" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.NodeDumper
+elif [ "$COMMAND" = "plugin" ] ; then
+ CLASS=org.apache.nutch.plugin.PluginRepository
+elif [ "$COMMAND" = "junit" ] ; then
+ CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
+ if $local; then
+ for f in "$NUTCH_HOME"/test/lib/*.jar; do
+ CLASSPATH="${CLASSPATH}:$f";
+ done
+ fi
+ CLASS=org.junit.runner.JUnitCore
+elif [ "$COMMAND" = "startserver" ] ; then
+ CLASS=org.apache.nutch.service.NutchServer
+elif [ "$COMMAND" = "webapp" ] ; then
+ CLASS=org.apache.nutch.webui.NutchUiServer
+elif [ "$COMMAND" = "warc" ] ; then
+ CLASS=org.apache.nutch.tools.warc.WARCExporter
+elif [ "$COMMAND" = "updatehostdb" ] ; then
+ CLASS=org.apache.nutch.hostdb.UpdateHostDb
+elif [ "$COMMAND" = "readhostdb" ] ; then
+ CLASS=org.apache.nutch.hostdb.ReadHostDb
+else
+ CLASS=$COMMAND
+fi
+
+# distributed mode
+EXEC_CALL=(hadoop jar "$NUTCH_JOB")
+
+if $local; then
+ EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH")
+else
+ # check that hadoop can be found on the path
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+ echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+ exit -1;
+ fi
+fi
+
+# run it
+exec "${EXEC_CALL[@]}" $CLASS "$@"
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml
new file mode 100644
index 0000000..62e2e58
--- /dev/null
+++ b/nutch-core/pom.xml
@@ -0,0 +1,522 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-parent</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>nutch-core</artifactId>
+ <packaging>jar</packaging>
+
+ <name>Apache Nutch</name>
+ <description>Nutch is open source web-search software.
+ It builds on Hadoop, Tika and Solr, adding web-specifics
+ such as a crawler, a link-graph database, etc.
+ </description>
+ <url>http://nutch.apache.org</url>
+ <licenses>
+ <license>
+ <name>The Apache Software License, Version 2.0</name>
+ <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+ <distribution>repo</distribution>
+ </license>
+ </licenses>
+
+ <scm>
+ <developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/nutch.git</developerConnection>
+ <connection>scm:git:http://git-wip-us.apache.org/repos/asf/nutch.git</connection>
+ <url>https://git-wip-us.apache.org/repos/asf/nutch.git</url>
+ </scm>
+
+ <pluginRepositories>
+ <pluginRepository>
+ <id>miredot</id>
+ <name>MireDot Releases</name>
+ <url>http://nexus.qmino.com/content/repositories/miredot</url>
+ </pluginRepository>
+ </pluginRepositories>
+
+ <developers>
+ <developer>
+ <id>mattmann</id>
+ <name>Chris A. Mattmann</name>
+ <email>mattmann@apache.org</email>
+ </developer>
+ <developer>
+ <id>jnioche</id>
+ <name>Julien Nioche</name>
+ <email>jnioche@apache.org</email>
+ </developer>
+ <developer>
+ <id>lewismc</id>
+ <name>Lewis John McGibbney</name>
+ <email>lewismc@apache.org</email>
+ </developer>
+ <developer>
+ <id>markus</id>
+ <name>Markus Jelsma</name>
+ <email>markus@apache.org</email>
+ </developer>
+ <developer>
+ <id>fenglu</id>
+ <name>Feng Lu</name>
+ <email>fenglu@apache.org</email>
+ </developer>
+ <developer>
+ <id>kiranch</id>
+ <name>Kiran Chitturi</name>
+ <email>kiranch@apache.org</email>
+ </developer>
+ <developer>
+ <id>tejasp</id>
+ <name>Tejas Patil</name>
+ <email>tejasp@apache.org</email>
+ </developer>
+ <developer>
+ <id>talat</id>
+ <name>Talat Uyarer</name>
+ <email>talat@apache.org</email>
+ </developer>
+ <developer>
+ <id>snagel</id>
+ <name>Sebastian Nagel</name>
+ <email>snagel@apache.org</email>
+ </developer>
+ <developer>
+ <id>thammegowda</id>
+ <name>Thamme Gowda</name>
+ <email>thammegowda@apache.org</email>
+ </developer>
+ </developers>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <slf4j.version>1.7.12</slf4j.version>
+ <junit.version>4.12</junit.version>
+ <dir.root>${project.parent.basedir}</dir.root>
+ <libs.dir>${dir.local}${file.separator}lib</libs.dir>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>${slf4j.version}</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.6</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>commons-collections</groupId>
+ <artifactId>commons-collections</artifactId>
+ <version>3.2.1</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ <version>3.1</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>1.10</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ <version>1.9</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-jexl</artifactId>
+ <version>2.1.1</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.tdunning</groupId>
+ <artifactId>t-digest</artifactId>
+ <version>3.1</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <version>2.4.0</version>
+ <optional>true</optional>
+ <exclusions>
+ <exclusion>
+ <groupId>hsqldb</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>net.sf.kosmosfs</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>net.java.dev.jets3t</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.eclipse.jdt</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>ant</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <version>2.4.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-mapreduce-client-core</artifactId>
+ <version>2.4.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+ <version>2.4.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>1.12</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j</artifactId>
+ <version>55.1</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>xerces</groupId>
+ <artifactId>xercesImpl</artifactId>
+ <version>2.11.0</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>xerces</groupId>
+ <artifactId>xmlParserAPIs</artifactId>
+ <version>2.6.2</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>oro</groupId>
+ <artifactId>oro</artifactId>
+ <version>2.0.8</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ <version>16.0.1</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.github.crawler-commons</groupId>
+ <artifactId>crawler-commons</artifactId>
+ <version>0.6</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.martinkl.warc</groupId>
+ <artifactId>warc-hadoop</artifactId>
+ <version>0.1.0</version>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.cxf</groupId>
+ <artifactId>cxf-rt-frontend-jaxws</artifactId>
+ <version>3.0.4</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.cxf</groupId>
+ <artifactId>cxf-rt-frontend-jaxrs</artifactId>
+ <version>3.0.4</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.cxf</groupId>
+ <artifactId>cxf-rt-transports-http</artifactId>
+ <version>3.0.4</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.cxf</groupId>
+ <artifactId>cxf-rt-transports-http-jetty</artifactId>
+ <version>3.0.4</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.cxf</groupId>
+ <artifactId>cxf-rt-rs-client</artifactId>
+ <version>3.0.4</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <version>2.5.1</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.dataformat</groupId>
+ <artifactId>jackson-dataformat-cbor</artifactId>
+ <version>2.5.1</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.jaxrs</groupId>
+ <artifactId>jackson-jaxrs-json-provider</artifactId>
+ <version>2.5.1</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <version>4.10.2</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.netpreserve.commons</groupId>
+ <artifactId>webarchive-commons</artifactId>
+ <version>1.1.5</version>
+ <optional>true</optional>
+ <exclusions>
+ <exclusion>
+ <groupId>*</groupId>
+ <artifactId>hadoop-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.google.guava</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>junit</groupId>
+ <artifactId>*</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.11</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.mrunit</groupId>
+ <artifactId>mrunit</artifactId>
+ <version>1.1.0</version>
+ <classifier>hadoop2</classifier>
+ <optional>true</optional>
+ <exclusions>
+ <exclusion>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty-client</artifactId>
+ <version>6.1.22</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty</artifactId>
+ <version>6.1.22</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ <version>6.1.22</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-collections4</artifactId>
+ <version>4.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.springframework</groupId>
+ <artifactId>spring-core</artifactId>
+ <version>4.0.4.RELEASE</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.springframework</groupId>
+ <artifactId>spring-context</artifactId>
+ <version>4.0.4.RELEASE</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.springframework</groupId>
+ <artifactId>spring-web</artifactId>
+ <version>4.0.4.RELEASE</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-client</artifactId>
+ <version>1.8</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>com.j256.ormlite</groupId>
+ <artifactId>ormlite-jdbc</artifactId>
+ <version>4.48</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>com.h2database</groupId>
+ <artifactId>h2</artifactId>
+ <version>1.4.180</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.persistence</groupId>
+ <artifactId>javax.persistence</artifactId>
+ <version>2.0.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.wicket</groupId>
+ <artifactId>wicket-core</artifactId>
+ <version>6.16.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.wicket</groupId>
+ <artifactId>wicket-spring</artifactId>
+ <version>6.16.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>de.agilecoders.wicket</groupId>
+ <artifactId>wicket-bootstrap-core</artifactId>
+ <version>0.9.2</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>de.agilecoders.wicket</groupId>
+ <artifactId>wicket-bootstrap-extensions</artifactId>
+ <version>0.9.2</version>
+ <optional>true</optional>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <resources>
+ <resource>
+ <directory>${project.parent.basedir}${file.separator}conf</directory>
+ </resource>
+ </resources>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <version>2.6</version>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>3.0.1</version>
+ <executions>
+ <execution>
+ <id>copy-resources</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${libs.dir}</outputDirectory>
+ <resources>
+ <resource>
+ <directory>${project.build.directory}</directory>
+ <include>${build.finalName}.jar</include>
+ </resource>
+ <resource>
+ <directory>${project.basedir}</directory>
+ <include>plugin.xml</include>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <excludedGroups>org.apache.nutch.test.IntegrationTest</excludedGroups>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <systemPropertyVariables>
+ <plugin.folders>../runtime/local/plugins</plugin.folders>
+ </systemPropertyVariables>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
new file mode 100755
index 0000000..c259419
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
@@ -0,0 +1,227 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * This class provides common methods for implementations of
+ * <code>FetchSchedule</code>.
+ *
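+ * <p>
+ * A minimal concrete schedule (illustrative sketch, name hypothetical) can
+ * rely entirely on the defaults implemented here:
+ * </p>
+ *
+ * <pre>
+ * public class SimpleFetchSchedule extends AbstractFetchSchedule {
+ *   // inherits initializeSchedule(), shouldFetch(), forceRefetch(), ...
+ * }
+ * </pre>
+ *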
+ * @author Andrzej Bialecki
+ */
+public abstract class AbstractFetchSchedule extends Configured implements
+ FetchSchedule {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(AbstractFetchSchedule.class);
+
+ protected int defaultInterval;
+ protected int maxInterval;
+
+ public AbstractFetchSchedule() {
+ super(null);
+ }
+
+ public AbstractFetchSchedule(Configuration conf) {
+ super(conf);
+ }
+
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ if (conf == null)
+ return;
+ defaultInterval = conf.getInt("db.fetch.interval.default", 0);
+ maxInterval = conf.getInt("db.fetch.interval.max", 0);
+ LOG.info("defaultInterval=" + defaultInterval);
+ LOG.info("maxInterval=" + maxInterval);
+ }
+
+ /**
+ * Initialize fetch schedule related data. Implementations should at least set
+ * the <code>fetchTime</code> and <code>fetchInterval</code>. The default
+ * implementation sets the <code>fetchTime</code> to now, using the default
+ * <code>fetchInterval</code>.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * datum instance to be initialized (modified in place).
+ */
+ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
+ datum.setFetchTime(System.currentTimeMillis());
+ datum.setFetchInterval(defaultInterval);
+ datum.setRetriesSinceFetch(0);
+ return datum;
+ }
+
+ /**
+ * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
+ * successfully fetched page. NOTE: this implementation resets the retry
+ * counter - extending classes should call super.setFetchSchedule() to
+ * preserve this behavior.
+ */
+ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ long prevFetchTime, long prevModifiedTime, long fetchTime,
+ long modifiedTime, int state) {
+ datum.setRetriesSinceFetch(0);
+ return datum;
+ }
+
+ /**
+ * This method specifies how to schedule refetching of pages marked as GONE.
+ * Default implementation increases fetchInterval by 50% but the value may
+ * never exceed <code>maxInterval</code>.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * datum instance to be adjusted.
+ *
+ * @return adjusted page information, including all original information.
+ * NOTE: this may be a different instance than {@link CrawlDatum}, but
+ * implementations should make sure that it contains at least all
+ * information from {@link CrawlDatum}.
+ */
+ public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
+ long prevFetchTime, long prevModifiedTime, long fetchTime) {
+ // no page is truly GONE ... just increase the interval by 50%
+ // and try much later.
+ if ((datum.getFetchInterval() * 1.5f) < maxInterval)
+ datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
+ else
+ datum.setFetchInterval(maxInterval * 0.9f);
+ datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000);
+ return datum;
+ }
+
+ /**
+ * This method adjusts the fetch schedule if fetching needs to be re-tried due
+ * to transient errors. The default implementation sets the next fetch time 1
+ * day in the future and increases the retry counter.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * page information.
+ *
+ * @param prevFetchTime
+ * previous fetch time.
+ *
+ * @param prevModifiedTime
+ * previous modified time.
+ *
+ * @param fetchTime
+ * current fetch time.
+ *
+ * @return adjusted page information, including all original information.
+ * NOTE: this may be a different instance than {@link CrawlDatum}, but
+ * implementations should make sure that it contains at least all
+ * information from {@link CrawlDatum}.
+ */
+ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
+ long prevFetchTime, long prevModifiedTime, long fetchTime) {
+ datum.setFetchTime(fetchTime + (long) SECONDS_PER_DAY * 1000);
+ datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
+ return datum;
+ }
+
+ /**
+ * This method returns the last fetch time of the CrawlDatum.
+ *
+ * @return the date as a long.
+ */
+ public long calculateLastFetchTime(CrawlDatum datum) {
+ return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
+ }
+
+ /**
+ * This method provides information whether the page is suitable for selection
+ * in the current fetchlist. NOTE: a true return value does not guarantee that
+ * the page will be fetched, it just allows it to be included in the further
+ * selection process based on scores. The default implementation checks
+ * <code>fetchTime</code>: if it is higher than <code>curTime</code> it
+ * returns false, and true otherwise. It will also check that fetchTime is not
+ * too remote (more than <code>maxInterval</code> ahead), in which case it
+ * lowers the interval and returns true.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * datum instance.
+ *
+ * @param curTime
+ * reference time (usually set to the time when the fetchlist
+ * generation process was started).
+ *
+ * @return true, if the page should be considered for inclusion in the current
+ * fetchlist, otherwise false.
+ */
+ public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
+ // pages are never truly GONE - we have to check them from time to time.
+ // pages with too long fetchInterval are adjusted so that they fit within
+ // maximum fetchInterval (segment retention period).
+ if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
+ if (datum.getFetchInterval() > maxInterval) {
+ datum.setFetchInterval(maxInterval * 0.9f);
+ }
+ datum.setFetchTime(curTime);
+ }
+ if (datum.getFetchTime() > curTime) {
+ return false; // not time yet
+ }
+ return true;
+ }
+
+ /**
+ * This method resets fetchTime, fetchInterval, modifiedTime,
+ * retriesSinceFetch and page signature, so that it forces refetching.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * datum instance.
+ *
+ * @param asap
+ * if true, force refetch as soon as possible - this sets the
+ * fetchTime to now. If false, force refetch whenever the next fetch
+ * time is set.
+ */
+ public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
+ // reduce fetchInterval so that it fits within the max value
+ if (datum.getFetchInterval() > maxInterval)
+ datum.setFetchInterval(maxInterval * 0.9f);
+ datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ datum.setRetriesSinceFetch(0);
+ datum.setSignature(null);
+ datum.setModifiedTime(0L);
+ if (asap)
+ datum.setFetchTime(System.currentTimeMillis());
+ return datum;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
new file mode 100755
index 0000000..08cad34
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class implements an adaptive re-fetch algorithm. This works as follows:
+ * <ul>
+ * <li>for pages that have changed since the last fetchTime, decrease their
+ * fetchInterval by a factor of DEC_RATE (default value is 0.2f).</li>
+ * <li>for pages that haven't changed since the last fetchTime, increase their
+ * fetchInterval by a factor of INC_RATE (default value is 0.2f).<br>
+ * If SYNC_DELTA property is true, then:
+ * <ul>
+ * <li>calculate a <code>delta = fetchTime - modifiedTime</code></li>
+ * <li>try to synchronize with the time of change, by shifting the next
+ * fetchTime by a fraction of the difference between the last modification time
+ * and the last fetch time. I.e. the next fetch time will be set to
+ * <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li>
+ * <li>if the <code>delta</code> is bigger than the adjusted fetch interval,
+ * then <code>fetchInterval = delta</code>.</li>
+ * </ul>
+ * </li>
+ * <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL
+ * (default is 1 minute).</li>
+ * <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL
+ * (default is 365 days).</li>
+ * </ul>
+ * <p>
+ * NOTE: values of DEC_RATE and INC_RATE higher than 0.4f may destabilize
+ * the algorithm, so that the fetch interval either increases or decreases
+ * infinitely, with little relevance to the page changes. Please use the
+ * {@link #main(String[])} method to test the values before applying them in a
+ * production system.
+ * </p>
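+ * <p>
+ * Worked example (numbers for illustration only): with the default INC_RATE
+ * and DEC_RATE of 0.2f, a page on a 10-day interval found unmodified is next
+ * scheduled after 10 * 1.2 = 12 days; found modified, after 10 * 0.8 = 8
+ * days, subject to the MIN_INTERVAL/MAX_INTERVAL bounds above.
+ * </p>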
+ *
+ * @author Andrzej Bialecki
+ */
+public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
+
+ public static final Logger LOG = LoggerFactory
+ .getLogger(AdaptiveFetchSchedule.class);
+
+ protected float INC_RATE;
+
+ protected float DEC_RATE;
+
+ private float MAX_INTERVAL;
+
+ private float MIN_INTERVAL;
+
+ private boolean SYNC_DELTA;
+
+ private double SYNC_DELTA_RATE;
+
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ if (conf == null)
+ return;
+ INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
+ DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
+ MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", (float) 60.0);
+ MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval",
+ (float) SECONDS_PER_DAY * 365); // 1 year
+ SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
+ SYNC_DELTA_RATE = conf.getFloat(
+ "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
+ }
+
+ @Override
+ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ long prevFetchTime, long prevModifiedTime, long fetchTime,
+ long modifiedTime, int state) {
+ super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+ fetchTime, modifiedTime, state);
+
+ float interval = datum.getFetchInterval();
+ long refTime = fetchTime;
+
+ // https://issues.apache.org/jira/browse/NUTCH-1430
+ interval = (interval == 0) ? defaultInterval : interval;
+
+ if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
+ // Is fetch interval preset in CrawlDatum MD? Then use preset interval
+ FloatWritable customIntervalWritable = (FloatWritable) (datum
+ .getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
+ interval = customIntervalWritable.get();
+ } else {
+ if (modifiedTime <= 0)
+ modifiedTime = fetchTime;
+ switch (state) {
+ case FetchSchedule.STATUS_MODIFIED:
+ interval *= (1.0f - DEC_RATE);
+ break;
+ case FetchSchedule.STATUS_NOTMODIFIED:
+ interval *= (1.0f + INC_RATE);
+ break;
+ case FetchSchedule.STATUS_UNKNOWN:
+ break;
+ }
+ if (SYNC_DELTA) {
+ // try to synchronize with the time of change
+ long delta = (fetchTime - modifiedTime) / 1000L;
+ if (delta > interval)
+ interval = delta;
+ refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
+ }
+ if (interval < MIN_INTERVAL) {
+ interval = MIN_INTERVAL;
+ } else if (interval > MAX_INTERVAL) {
+ interval = MAX_INTERVAL;
+ }
+ }
+
+ datum.setFetchInterval(interval);
+ datum.setFetchTime(refTime + Math.round(interval * 1000.0));
+ datum.setModifiedTime(modifiedTime);
+ return datum;
+ }
+
+ public static void main(String[] args) throws Exception {
+ FetchSchedule fs = new AdaptiveFetchSchedule();
+ fs.setConf(NutchConfiguration.create());
+ // we start the time at 0, for simplicity
+ long curTime = 0;
+ long delta = 1000L * 3600L * 24L; // 1 day
+ // we trigger the update of the page every 30 days
+ long update = 1000L * 3600L * 24L * 30L; // 30 days
+ boolean changed = true;
+ long lastModified = 0;
+ int miss = 0;
+ int totalMiss = 0;
+ int maxMiss = 0;
+ int fetchCnt = 0;
+ int changeCnt = 0;
+ // initial fetchInterval is 30 days
+ CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
+ p.setFetchTime(0);
+ LOG.info(p.toString());
+ // step the simulated clock forward, one delta per iteration
+ for (int i = 0; i < 10000; i++) {
+ if (lastModified + update < curTime) {
+ // System.out.println("i=" + i + ", lastModified=" + lastModified +
+ // ", update=" + update + ", curTime=" + curTime);
+ changed = true;
+ changeCnt++;
+ lastModified = curTime;
+ }
+ LOG.info(i + ". " + changed + "\twill fetch at "
+ + (p.getFetchTime() / delta) + "\tinterval "
+ + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+ + miss);
+ if (p.getFetchTime() <= curTime) {
+ fetchCnt++;
+ fs.setFetchSchedule(new Text("http://www.example.com"), p, p
+ .getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+ changed ? FetchSchedule.STATUS_MODIFIED
+ : FetchSchedule.STATUS_NOTMODIFIED);
+ LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+ + (p.getFetchTime() / delta) + "\tinterval "
+ + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
+ if (!changed)
+ miss++;
+ if (miss > maxMiss)
+ maxMiss = miss;
+ changed = false;
+ totalMiss += miss;
+ miss = 0;
+ }
+ if (changed)
+ miss++;
+ curTime += delta;
+ }
+ LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+ LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+ + " times.");
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
new file mode 100644
index 0000000..7fe3e1e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -0,0 +1,572 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.util.*;
+import java.util.Map.Entry;
+
+import org.apache.commons.jexl2.JexlContext;
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.jexl2.MapContext;
+
+import org.apache.hadoop.io.*;
+import org.apache.nutch.util.*;
+
+/** The crawl state of a URL. */
+public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
+ public static final String GENERATE_DIR_NAME = "crawl_generate";
+ public static final String FETCH_DIR_NAME = "crawl_fetch";
+ public static final String PARSE_DIR_NAME = "crawl_parse";
+
+ private final static byte CUR_VERSION = 7;
+
+ /** Compatibility values for on-the-fly conversion from versions < 5. */
+ private static final byte OLD_STATUS_SIGNATURE = 0;
+ private static final byte OLD_STATUS_DB_UNFETCHED = 1;
+ private static final byte OLD_STATUS_DB_FETCHED = 2;
+ private static final byte OLD_STATUS_DB_GONE = 3;
+ private static final byte OLD_STATUS_LINKED = 4;
+ private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
+ private static final byte OLD_STATUS_FETCH_RETRY = 6;
+ private static final byte OLD_STATUS_FETCH_GONE = 7;
+
+ private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>();
+
+ /** Page was not fetched yet. */
+ public static final byte STATUS_DB_UNFETCHED = 0x01;
+ /** Page was successfully fetched. */
+ public static final byte STATUS_DB_FETCHED = 0x02;
+ /** Page no longer exists. */
+ public static final byte STATUS_DB_GONE = 0x03;
+ /** Page temporarily redirects to other page. */
+ public static final byte STATUS_DB_REDIR_TEMP = 0x04;
+ /** Page permanently redirects to other page. */
+ public static final byte STATUS_DB_REDIR_PERM = 0x05;
+ /** Page was successfully fetched and found not modified. */
+ public static final byte STATUS_DB_NOTMODIFIED = 0x06;
+ public static final byte STATUS_DB_DUPLICATE = 0x07;
+
+ /** Maximum value of DB-related status. */
+ public static final byte STATUS_DB_MAX = 0x1f;
+
+ /** Fetching was successful. */
+ public static final byte STATUS_FETCH_SUCCESS = 0x21;
+ /** Fetching unsuccessful, needs to be retried (transient errors). */
+ public static final byte STATUS_FETCH_RETRY = 0x22;
+ /** Fetching temporarily redirected to other page. */
+ public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
+ /** Fetching permanently redirected to other page. */
+ public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
+ /** Fetching unsuccessful - page is gone. */
+ public static final byte STATUS_FETCH_GONE = 0x25;
+ /** Fetching successful - page is not modified. */
+ public static final byte STATUS_FETCH_NOTMODIFIED = 0x26;
+
+ /** Maximum value of fetch-related status. */
+ public static final byte STATUS_FETCH_MAX = 0x3f;
+
+ /** Page signature. */
+ public static final byte STATUS_SIGNATURE = 0x41;
+ /** Page was newly injected. */
+ public static final byte STATUS_INJECTED = 0x42;
+ /** Page discovered through a link. */
+ public static final byte STATUS_LINKED = 0x43;
+ /** Page got metadata from a parser */
+ public static final byte STATUS_PARSE_META = 0x44;
+
+ public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>();
+ static {
+ statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
+ statNames.put(STATUS_DB_FETCHED, "db_fetched");
+ statNames.put(STATUS_DB_GONE, "db_gone");
+ statNames.put(STATUS_DB_REDIR_TEMP, "db_redir_temp");
+ statNames.put(STATUS_DB_REDIR_PERM, "db_redir_perm");
+ statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
+ statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
+ statNames.put(STATUS_SIGNATURE, "signature");
+ statNames.put(STATUS_INJECTED, "injected");
+ statNames.put(STATUS_LINKED, "linked");
+ statNames.put(STATUS_FETCH_SUCCESS, "fetch_success");
+ statNames.put(STATUS_FETCH_RETRY, "fetch_retry");
+ statNames.put(STATUS_FETCH_REDIR_TEMP, "fetch_redir_temp");
+ statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
+ statNames.put(STATUS_FETCH_GONE, "fetch_gone");
+ statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
+ statNames.put(STATUS_PARSE_META, "parse_metadata");
+
+ oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
+ oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
+ oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
+ oldToNew.put(OLD_STATUS_FETCH_GONE, STATUS_FETCH_GONE);
+ oldToNew.put(OLD_STATUS_FETCH_SUCCESS, STATUS_FETCH_SUCCESS);
+ oldToNew.put(OLD_STATUS_FETCH_RETRY, STATUS_FETCH_RETRY);
+ oldToNew.put(OLD_STATUS_LINKED, STATUS_LINKED);
+ oldToNew.put(OLD_STATUS_SIGNATURE, STATUS_SIGNATURE);
+ }
+
+ private byte status;
+ private long fetchTime = System.currentTimeMillis();
+ private byte retries;
+ private int fetchInterval;
+ private float score = 0.0f;
+ private byte[] signature = null;
+ private long modifiedTime;
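+ // lazily instantiated; see getMetaData()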
+ private org.apache.hadoop.io.MapWritable metaData;
+
+ public static boolean hasDbStatus(CrawlDatum datum) {
+ return datum.status <= STATUS_DB_MAX;
+ }
+
+ public static boolean hasFetchStatus(CrawlDatum datum) {
+ return datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX;
+ }
+
+ public CrawlDatum() {
+ }
+
+ public CrawlDatum(int status, int fetchInterval) {
+ this();
+ this.status = (byte) status;
+ this.fetchInterval = fetchInterval;
+ }
+
+ public CrawlDatum(int status, int fetchInterval, float score) {
+ this(status, fetchInterval);
+ this.score = score;
+ }
+
+ //
+ // accessor methods
+ //
+
+ public byte getStatus() {
+ return status;
+ }
+
+ public static String getStatusName(byte value) {
+ String res = statNames.get(value);
+ if (res == null)
+ res = "unknown";
+ return res;
+ }
+
+ public void setStatus(int status) {
+ this.status = (byte) status;
+ }
+
+ /**
+ * Returns either the time of the last fetch, or the next fetch time,
+ * depending on whether Fetcher or CrawlDbReducer set the time.
+ */
+ public long getFetchTime() {
+ return fetchTime;
+ }
+
+ /**
+ * Sets either the time of the last fetch or the next fetch time, depending on
+ * whether Fetcher or CrawlDbReducer set the time.
+ */
+ public void setFetchTime(long fetchTime) {
+ this.fetchTime = fetchTime;
+ }
+
+ public long getModifiedTime() {
+ return modifiedTime;
+ }
+
+ public void setModifiedTime(long modifiedTime) {
+ this.modifiedTime = modifiedTime;
+ }
+
+ public byte getRetriesSinceFetch() {
+ return retries;
+ }
+
+ public void setRetriesSinceFetch(int retries) {
+ this.retries = (byte) retries;
+ }
+
+ public int getFetchInterval() {
+ return fetchInterval;
+ }
+
+ public void setFetchInterval(int fetchInterval) {
+ this.fetchInterval = fetchInterval;
+ }
+
+ public void setFetchInterval(float fetchInterval) {
+ this.fetchInterval = Math.round(fetchInterval);
+ }
+
+ public float getScore() {
+ return score;
+ }
+
+ public void setScore(float score) {
+ this.score = score;
+ }
+
+ public byte[] getSignature() {
+ return signature;
+ }
+
+ public void setSignature(byte[] signature) {
+ if (signature != null && signature.length > 256)
+ throw new RuntimeException("Max signature length (256) exceeded: "
+ + signature.length);
+ this.signature = signature;
+ }
+
+ public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
+ this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
+ }
+
+ /**
+ * Add all metadata from other CrawlDatum to this CrawlDatum.
+ *
+ * @param other
+ * CrawlDatum
+ */
+ public void putAllMetaData(CrawlDatum other) {
+ for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
+ getMetaData().put(e.getKey(), e.getValue());
+ }
+ }
+
+ /**
+ * Returns the metadata map if it was set or read in readFields(DataInput);
+ * returns an empty, lazily instantiated map if this CrawlDatum was freshly
+ * created.
+ */
+ public org.apache.hadoop.io.MapWritable getMetaData() {
+ if (this.metaData == null)
+ this.metaData = new org.apache.hadoop.io.MapWritable();
+ return this.metaData;
+ }
+
+ //
+ // writable methods
+ //
+
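+ /** Convenience factory: reads and returns a new CrawlDatum from the given stream. */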
+ public static CrawlDatum read(DataInput in) throws IOException {
+ CrawlDatum result = new CrawlDatum();
+ result.readFields(in);
+ return result;
+ }
+
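+ /** Deserializes this datum, converting records written by older versions on the fly. */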
+ public void readFields(DataInput in) throws IOException {
+ byte version = in.readByte(); // read version
+ if (version > CUR_VERSION) // check version
+ throw new VersionMismatchException(CUR_VERSION, version);
+
+ status = in.readByte();
+ fetchTime = in.readLong();
+ retries = in.readByte();
+ if (version > 5) {
+ fetchInterval = in.readInt();
+ } else
+ fetchInterval = Math.round(in.readFloat());
+ score = in.readFloat();
+ if (version > 2) {
+ modifiedTime = in.readLong();
+ int cnt = in.readByte();
+ if (cnt > 0) {
+ signature = new byte[cnt];
+ in.readFully(signature);
+ } else
+ signature = null;
+ }
+
+ if (version > 3) {
+ boolean hasMetadata = false;
+ if (version < 7) {
+ // before version 7, metadata was read into an intermediate map and
+ // copied over entry by entry
+ org.apache.hadoop.io.MapWritable oldMetaData = new org.apache.hadoop.io.MapWritable();
+ if (in.readBoolean()) {
+ hasMetadata = true;
+ metaData = new org.apache.hadoop.io.MapWritable();
+ oldMetaData.readFields(in);
+ for (Writable key : oldMetaData.keySet()) {
+ metaData.put(key, oldMetaData.get(key));
+ }
+ }
+ } else {
+ if (in.readBoolean()) {
+ hasMetadata = true;
+ metaData = new org.apache.hadoop.io.MapWritable();
+ metaData.readFields(in);
+ }
+ }
+ if (!hasMetadata)
+ metaData = null;
+ }
+ // translate status codes
+ if (version < 5) {
+ if (oldToNew.containsKey(status))
+ status = oldToNew.get(status);
+ else
+ status = STATUS_DB_UNFETCHED;
+ }
+ }
+
+ /** Byte offset within a serialized CrawlDatum at which the score is stored. */
+ private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
+ /** Byte offset of the signature length byte: the score (4 bytes) and modified time (8 bytes) follow the score offset. */
+ private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;
+
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(CUR_VERSION); // store current version
+ out.writeByte(status);
+ out.writeLong(fetchTime);
+ out.writeByte(retries);
+ out.writeInt(fetchInterval);
+ out.writeFloat(score);
+ out.writeLong(modifiedTime);
+ if (signature == null) {
+ out.writeByte(0);
+ } else {
+ out.writeByte(signature.length);
+ out.write(signature);
+ }
+ if (metaData != null && metaData.size() > 0) {
+ out.writeBoolean(true);
+ metaData.write(out);
+ } else {
+ out.writeBoolean(false);
+ }
+ }
+
+ /** Copy the contents of another instance into this instance. */
+ public void set(CrawlDatum that) {
+ this.status = that.status;
+ this.fetchTime = that.fetchTime;
+ this.retries = that.retries;
+ this.fetchInterval = that.fetchInterval;
+ this.score = that.score;
+ this.modifiedTime = that.modifiedTime;
+ this.signature = that.signature;
+ if (that.metaData != null) {
+ this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); // make a deep copy
+ } else {
+ this.metaData = null;
+ }
+ }
+
+ //
+ // compare methods
+ //
+
+ /** Sort by decreasing score, breaking ties on status, fetch time, retries, fetch interval, modified time and finally signature. */
+ public int compareTo(CrawlDatum that) {
+ if (that.score != this.score)
+ return (that.score - this.score) > 0 ? 1 : -1;
+ if (that.status != this.status)
+ return this.status - that.status;
+ if (that.fetchTime != this.fetchTime)
+ return (that.fetchTime - this.fetchTime) > 0 ? 1 : -1;
+ if (that.retries != this.retries)
+ return that.retries - this.retries;
+ if (that.fetchInterval != this.fetchInterval)
+ return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
+ if (that.modifiedTime != this.modifiedTime)
+ return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
+ return SignatureComparator._compare(this, that);
+ }
+
+ /** A Comparator optimized for CrawlDatum. */
+ public static class Comparator extends WritableComparator {
+ public Comparator() {
+ super(CrawlDatum.class);
+ }
+
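+ /** Compares two serialized datums directly on their bytes, using the field offsets defined above. */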
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ float score1 = readFloat(b1, s1 + SCORE_OFFSET);
+ float score2 = readFloat(b2, s2 + SCORE_OFFSET);
+ if (score2 != score1) {
+ return (score2 - score1) > 0 ? 1 : -1;
+ }
+ int status1 = b1[s1 + 1];
+ int status2 = b2[s2 + 1];
+ if (status2 != status1)
+ return status1 - status2;
+ long fetchTime1 = readLong(b1, s1 + 1 + 1);
+ long fetchTime2 = readLong(b2, s2 + 1 + 1);
+ if (fetchTime2 != fetchTime1)
+ return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;
+ int retries1 = b1[s1 + 1 + 1 + 8];
+ int retries2 = b2[s2 + 1 + 1 + 8];
+ if (retries2 != retries1)
+ return retries2 - retries1;
+ int fetchInterval1 = readInt(b1, s1 + 1 + 1 + 8 + 1);
+ int fetchInterval2 = readInt(b2, s2 + 1 + 1 + 8 + 1);
+ if (fetchInterval2 != fetchInterval1)
+ return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
+ long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
+ long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
+ if (modifiedTime2 != modifiedTime1)
+ return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
+ int sigl1 = b1[s1 + SIG_OFFSET];
+ int sigl2 = b2[s2 + SIG_OFFSET];
+ // add the record offsets s1/s2 and skip the length byte at SIG_OFFSET so
+ // that the signature bytes themselves are compared
+ return SignatureComparator._compare(b1, s1 + SIG_OFFSET + 1, sigl1, b2,
+ s2 + SIG_OFFSET + 1, sigl2);
+ }
+ }
+
+ static { // register this comparator
+ WritableComparator.define(CrawlDatum.class, new Comparator());
+ }
+
+ //
+ // basic methods
+ //
+
+ public String toString() {
+ StringBuilder buf = new StringBuilder();
+ buf.append("Version: " + CUR_VERSION + "\n");
+ buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus())
+ + ")\n");
+ buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
+ buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
+ buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
+ buf.append("Retry interval: " + getFetchInterval() + " seconds ("
+ + (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
+ buf.append("Score: " + getScore() + "\n");
+ buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
+ buf.append("Metadata: \n ");
+ if (metaData != null) {
+ for (Entry<Writable, Writable> e : metaData.entrySet()) {
+ buf.append("\t");
+ buf.append(e.getKey());
+ buf.append("=");
+ buf.append(e.getValue());
+ buf.append("\n");
+ }
+ }
+ return buf.toString();
+ }
+
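+ /** Compares metadata maps, treating a null map and an empty one as equal. */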
+ private boolean metadataEquals(org.apache.hadoop.io.MapWritable otherMetaData) {
+ if (metaData == null || metaData.size() == 0) {
+ return otherMetaData == null || otherMetaData.size() == 0;
+ }
+ if (otherMetaData == null) {
+ // we already know that the current object is not null or empty
+ return false;
+ }
+ HashSet<Entry<Writable, Writable>> set1 = new HashSet<Entry<Writable, Writable>>(
+ metaData.entrySet());
+ HashSet<Entry<Writable, Writable>> set2 = new HashSet<Entry<Writable, Writable>>(
+ otherMetaData.entrySet());
+ return set1.equals(set2);
+ }
+
+ public boolean equals(Object o) {
+ if (!(o instanceof CrawlDatum))
+ return false;
+ CrawlDatum other = (CrawlDatum) o;
+ boolean res = (this.status == other.status)
+ && (this.fetchTime == other.fetchTime)
+ && (this.modifiedTime == other.modifiedTime)
+ && (this.retries == other.retries)
+ && (this.fetchInterval == other.fetchInterval)
+ && (SignatureComparator._compare(this.signature, other.signature) == 0)
+ && (this.score == other.score);
+ if (!res)
+ return res;
+ return metadataEquals(other.metaData);
+ }
+
+ public int hashCode() {
+ int res = 0;
+ if (signature != null) {
+ // fold the signature into the hash four bytes at a time; the parentheses
+ // are required because '+' binds more tightly than '<<' in Java
+ for (int i = 0; i + 3 < signature.length; i += 4) {
+ res ^= ((signature[i] << 24) + (signature[i + 1] << 16)
+ + (signature[i + 2] << 8) + signature[i + 3]);
+ }
+ }
+ if (metaData != null) {
+ res ^= metaData.entrySet().hashCode();
+ }
+ return res ^ status ^ ((int) fetchTime) ^ ((int) modifiedTime) ^ retries
+ ^ fetchInterval ^ Float.floatToIntBits(score);
+ }
+
+ public Object clone() {
+ try {
+ return super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
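+ /**
+ * Evaluates a JEXL expression against this datum. The expression sees the
+ * datum's fields as variables (status, fetchTime, modifiedTime, retries,
+ * interval, score, signature) plus any Text, IntWritable or FloatWritable
+ * metadata entries. A minimal usage sketch, assuming a commons-jexl2
+ * engine (the expression string is illustrative only):
+ *
+ * <pre>
+ * JexlEngine jexl = new JexlEngine();
+ * Expression expr = jexl.createExpression("status == 'db_fetched' and score > 0.5");
+ * boolean matches = datum.evaluate(expr);
+ * </pre>
+ */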
+ public boolean evaluate(Expression expr) {
+ if (expr != null) {
+ // Create a context and add data
+ JexlContext jcontext = new MapContext();
+
+ // https://issues.apache.org/jira/browse/NUTCH-2229
+ jcontext.set("status", getStatusName(getStatus()));
+ jcontext.set("fetchTime", (long)(getFetchTime()));
+ jcontext.set("modifiedTime", (long)(getModifiedTime()));
+ jcontext.set("retries", getRetriesSinceFetch());
+ jcontext.set("interval", new Integer(getFetchInterval()));
+ jcontext.set("score", getScore());
+ jcontext.set("signature", StringUtil.toHexString(getSignature()));
+
+ // Set metadata variables
+ for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
+ Object value = entry.getValue();
+
+ if (value instanceof FloatWritable) {
+ FloatWritable fvalue = (FloatWritable)value;
+ Text tkey = (Text)entry.getKey();
+ jcontext.set(tkey.toString(), fvalue.get());
+ }
+
+ if (value instanceof IntWritable) {
+ IntWritable ivalue = (IntWritable)value;
+ Text tkey = (Text)entry.getKey();
+ jcontext.set(tkey.toString(), ivalue.get());
+ }
+
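+ // JEXL variable names cannot contain '-', so dashes in metadata keys
+ // are replaced with underscores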
+ if (value instanceof Text) {
+ Text tvalue = (Text)value;
+ Text tkey = (Text)entry.getKey();
+ jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
+ }
+ }
+
+ try {
+ if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
+ return true;
+ }
+ } catch (Exception e) {
+ // a malformed expression or an incompatible operand type is treated
+ // as a non-match rather than an error
+ }
+ }
+
+ return false;
+ }
+}
\ No newline at end of file