Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/11 02:07:28 UTC
[1/2] nutch git commit: Reproduced runtime/local build without breaking backward compatibility
Repository: nutch
Updated Branches:
refs/heads/NUTCH-2292 9173fd4d6 -> 9f3ba3eda
Reproduced runtime/local build without breaking backward compatibility
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/020f581a
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/020f581a
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/020f581a
Branch: refs/heads/NUTCH-2292
Commit: 020f581a2cc735f26d6a423e87da7f7462ed3a35
Parents: 9173fd4
Author: Thamme Gowda <th...@apache.org>
Authored: Sun Jul 10 17:49:51 2016 -0700
Committer: Thamme Gowda <th...@apache.org>
Committed: Sun Jul 10 17:49:51 2016 -0700
----------------------------------------------------------------------
bin/crawl | 281 +++++++++++++++++++++++++++++++++++++++
bin/nutch | 324 +++++++++++++++++++++++++++++++++++++++++++++
nutch-core/pom.xml | 30 ++++-
nutch-plugins/pom.xml | 37 +++++-
pom.xml | 51 ++++++-
src/bin/crawl | 281 ---------------------------------------
src/bin/nutch | 324 ---------------------------------------------
7 files changed, 717 insertions(+), 611 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/bin/crawl
----------------------------------------------------------------------
diff --git a/bin/crawl b/bin/crawl
new file mode 100755
index 0000000..567d35e
--- /dev/null
+++ b/bin/crawl
@@ -0,0 +1,281 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
+# -i|--index Indexes crawl results into a configured indexer
+# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
+# are scheduled for fetching. Suffix can be: s for second,
+# m for minute, h for hour and d for day. If no suffix is
+# specified, seconds are used by default.
+# -D A Java property to pass to Nutch calls
+# Seed Dir Directory in which to look for a seeds file
+# Crawl Dir Directory where the crawl/link/segments dirs are saved
+# Num Rounds The number of rounds to run this crawl for
+#
+#
+# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
+# INDEXING FOR EACH SEGMENT
+
+INDEXFLAG=false
+JAVA_PROPERTIES=""
+WAIT=-1 # don't wait if there are no URLs to fetch
+
+function __to_seconds() {
+ NUMBER=$(echo $1 | tr -dc '0-9')
+ MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')
+
+ case $MODIFIER in
+ m|M)
+ SECONDS=`expr $NUMBER \* 60`
+ ;;
+ h|H)
+ SECONDS=`expr $NUMBER \* 3600`
+ ;;
+ d|D)
+ SECONDS=`expr $NUMBER \* 86400`
+ ;;
+ s|S|*)
+ SECONDS=$NUMBER
+ ;;
+ esac
+
+ echo $SECONDS
+}
+
+while [[ $# -gt 0 ]]
+do
+ case $1 in
+ -i|--index)
+ INDEXFLAG=true
+ shift
+ ;;
+ -D)
+ JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
+ shift 2
+ ;;
+ -w|--wait)
+ WAIT="${2}"
+ shift 2
+ ;;
+ *)
+ break
+ ;;
+ esac
+done
+
+if [[ $# != 3 ]]; then
+ echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
+ echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
+ echo -e "\t-D\t\tA Java property to pass to Nutch calls"
+ echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
+ echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
+ echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
+ echo -e "\t\t\tspecified second is used by default."
+ echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
+ echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
+ echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+ exit 1
+fi
+
+SEEDDIR="$1"
+CRAWL_PATH="$2"
+LIMIT="$3"
+
+# convert wait time to seconds for compatibility reasons
+if [ "$WAIT" != "-1" ]; then
+ WAIT=$( __to_seconds "$WAIT" )
+ echo "Time to wait (--wait) = $WAIT sec."
+fi
+
+#############################################
+# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
+#############################################
+
+# set the number of slave nodes
+numSlaves=1
+
+# and the total number of available tasks
+# sets Hadoop parameter "mapreduce.job.reduces"
+numTasks=`expr $numSlaves \* 2`
+
+# number of urls to fetch in one iteration
+# 250K per task?
+sizeFetchlist=`expr $numSlaves \* 50000`
+
+# time limit for fetching (minutes)
+timeLimitFetch=180
+
+# num threads for fetching
+numThreads=50
+
+#############################################
+
+bin="`dirname "$0"`"
+bin="`cd "$bin"; pwd`"
+
+# determine the mode based on the presence of a job file
+mode=local
+if [ -f "${bin}"/../*nutch*.job ]; then
+ mode=distributed
+fi
+
+# note that some of the options listed here could be set in the
+# corresponding hadoop site xml param file
+commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
+
+# check that hadoop can be found on the path
+if [ $mode = "distributed" ]; then
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+ echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+ exit -1;
+ fi
+fi
+
+
+function __bin_nutch {
+ # run $bin/nutch, exit if exit value indicates error
+
+ echo "$bin/nutch $@" ;# echo command and arguments
+ "$bin/nutch" "$@"
+
+ RETCODE=$?
+ if [ $RETCODE -ne 0 ]
+ then
+ echo "Error running:"
+ echo " $bin/nutch $@"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
+}
+
+
+
+# initial injection
+echo "Injecting seed URLs"
+__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+
+# main loop : rounds of generate - fetch - parse - update
+for ((a=1; ; a++))
+do
+ if [ -e ".STOP" ]
+ then
+ echo "STOP file found - escaping loop"
+ break
+ fi
+
+ if [ $LIMIT -ne -1 ]; then
+ if [ $a -gt $LIMIT ]; then
+ echo `date` ": Finished loop with $LIMIT iterations"
+ break
+ fi
+ echo `date` ": Iteration $a of $LIMIT"
+ else
+ echo `date` ": Iteration $a"
+ fi
+
+ echo "Generating a new segment"
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+ echo "$bin/nutch generate ${generate_args[@]}"
+ $bin/nutch generate "${generate_args[@]}"
+ RETCODE=$?
+ if [ $RETCODE -eq 0 ]; then
+ : # ok: no error
+ elif [ $RETCODE -eq 1 ]; then
+ echo "Generate returned 1 (no new segments created)"
+
+ if [ "$WAIT" -ne -1 ]; then
+ echo "Waiting for $WAIT sec. ..."
+ sleep $WAIT
+ continue
+ else
+ echo "Escaping loop: no more URLs to fetch now"
+ break
+ fi
+ else
+ echo "Error running:"
+ echo " $bin/nutch generate ${generate_args[@]}"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
+
+ # capture the name of the segment
+ # call hadoop in distributed mode
+ # or use ls
+
+ if [ $mode = "local" ]; then
+ SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
+ else
+ SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
+ fi
+
+ echo "Operating on segment : $SEGMENT"
+
+ # fetching the segment
+ echo "Fetching : $SEGMENT"
+ __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+
+ # parsing the segment
+ echo "Parsing : $SEGMENT"
+ # enable skipping of records during parsing so that a dodgy document
+ # does not fail the whole task
+ skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
+ __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
+
+ # updatedb with this segment
+ echo "CrawlDB update"
+ __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
+
+# note that the link inversion - indexing routine can be done within the main loop
+# on a per segment basis
+ echo "Link inversion"
+ __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+ echo "Dedup on crawldb"
+ __bin_nutch dedup "$CRAWL_PATH"/crawldb
+
+ if $INDEXFLAG; then
+ echo "Indexing $SEGMENT to index"
+ __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+ echo "Cleaning up index if possible"
+ __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
+ else
+ echo "Skipping indexing ..."
+ fi
+
+ #######################################################
+ # The following commands fall into WebGraph territory
+ # and should be uncommented based on your requirements
+ #######################################################
+ #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
+ #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
+ #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
+ #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
+ #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
+ #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
+
+done
+
+exit 0
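For reference, a typical local-mode invocation of the script above could look like the sketch below; the seed directory (urls), crawl directory (crawl) and Solr URL are hypothetical, and the -D property only matters when -i is given:

    # sketch: 3 rounds over seeds in ./urls, indexing each segment,
    # waiting 30 minutes whenever no URLs are due for fetching
    bin/crawl -i -D solr.server.url=http://localhost:8983/solr/nutch -w 30m urls crawl 3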
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/bin/nutch
----------------------------------------------------------------------
diff --git a/bin/nutch b/bin/nutch
new file mode 100755
index 0000000..1649069
--- /dev/null
+++ b/bin/nutch
@@ -0,0 +1,324 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# The Nutch command script
+#
+# Environment Variables
+#
+# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+#
+# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB.
+# Default is 1000.
+#
+# NUTCH_OPTS Extra Java runtime options.
+# Multiple options must be separated by white space.
+#
+# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs)
+#
+# NUTCH_LOGFILE Log file (default: hadoop.log)
+#
+# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf).
+# Multiple paths must be separated by a colon ':'.
+#
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+ ls=`ls -ld "$THIS"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ THIS="$link"
+ else
+ THIS=`dirname "$THIS"`/"$link"
+ fi
+done
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+ echo "nutch 1.12"
+ echo "Usage: nutch COMMAND"
+ echo "where COMMAND is one of:"
+ echo " readdb read / dump crawl db"
+ echo " mergedb merge crawldb-s, with optional filtering"
+ echo " readlinkdb read / dump link db"
+ echo " inject inject new urls into the database"
+ echo " generate generate new segments to fetch from crawl db"
+ echo " freegen generate new segments to fetch from text files"
+ echo " fetch fetch a segment's pages"
+ echo " parse parse a segment's pages"
+ echo " readseg read / dump segment data"
+ echo " mergesegs merge several segments, with optional filtering and slicing"
+ echo " updatedb update crawl db from segments after fetching"
+ echo " invertlinks create a linkdb from parsed segments"
+ echo " mergelinkdb merge linkdb-s, with optional filtering"
+ echo " index run the plugin-based indexer on parsed segments and linkdb"
+ echo " dedup deduplicate entries in the crawldb and give them a special status"
+ echo " dump exports crawled data from segments into files"
+ echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
+ echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
+ echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead"
+ echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
+ echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
+ echo " parsechecker check the parser for a given url"
+ echo " indexchecker check the indexing filters for a given url"
+ echo " filterchecker check url filters for a given url"
+ echo " normalizerchecker check url normalizers for a given url"
+ echo " domainstats calculate domain statistics from crawldb"
+ echo " protocolstats calculate protocol status code stats from crawldb"
+ echo " crawlcomplete calculate crawl completion stats from crawldb"
+ echo " webgraph generate a web graph from existing segments"
+ echo " linkrank run a link analysis program on the generated web graph"
+ echo " scoreupdater updates the crawldb with linkrank scores"
+ echo " nodedumper dumps the web graph's node scores"
+ echo " plugin load a plugin and run one of its classes main()"
+ echo " junit runs the given JUnit test"
+ echo " startserver runs the Nutch Server on localhost:8081"
+ echo " webapp run a local Nutch Web Application on locahost:8080"
+ echo " warc exports crawled data from segments at the WARC format"
+ echo " updatehostdb update the host db with records from the crawl db"
+ echo " readhostdb read / dump host db"
+ echo " or"
+ echo " CLASSNAME run the class named CLASSNAME"
+ echo "Most commands print help when invoked w/o parameters."
+ exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+# some directories
+THIS_DIR="`dirname "$THIS"`"
+NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"
+
+# some Java parameters
+if [ "$NUTCH_JAVA_HOME" != "" ]; then
+ #echo "run java in $NUTCH_JAVA_HOME"
+ JAVA_HOME="$NUTCH_JAVA_HOME"
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+ echo "Error: JAVA_HOME is not set."
+ exit 1
+fi
+
+local=true
+
+# NUTCH_JOB
+if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then
+ local=false
+ for f in "$NUTCH_HOME"/*nutch*.job; do
+ NUTCH_JOB="$f"
+ done
+ # cygwin path translation
+ if $cygwin; then
+ NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"
+ fi
+fi
+
+JAVA="$JAVA_HOME/bin/java"
+JAVA_HEAP_MAX=-Xmx1000m
+
+# check envvars which might override default args
+if [ "$NUTCH_HEAPSIZE" != "" ]; then
+ #echo "run with heapsize $NUTCH_HEAPSIZE"
+ JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
+ #echo $JAVA_HEAP_MAX
+fi
+
+# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
+CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
+CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# add libs to CLASSPATH
+if $local; then
+ for f in "$NUTCH_HOME"/lib/*.jar; do
+ CLASSPATH="${CLASSPATH}:$f";
+ done
+ # local runtime
+ # add plugins to classpath
+ if [ -d "$NUTCH_HOME/plugins" ]; then
+ CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
+ fi
+fi
+
+# cygwin path translation
+if $cygwin; then
+ CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
+fi
+
+# setup 'java.library.path' for native-hadoop code if necessary
+# used only in local mode
+JAVA_LIBRARY_PATH=''
+if [ -d "${NUTCH_HOME}/lib/native" ]; then
+
+ JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`
+
+ if [ -d "${NUTCH_HOME}/lib/native" ]; then
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+ else
+ JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+ fi
+ fi
+fi
+
+if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
+ JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`"
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+# default log directory & file
+if [ "$NUTCH_LOG_DIR" = "" ]; then
+ NUTCH_LOG_DIR="$NUTCH_HOME/logs"
+fi
+if [ "$NUTCH_LOGFILE" = "" ]; then
+ NUTCH_LOGFILE='hadoop.log'
+fi
+
+#Fix log path under cygwin
+if $cygwin; then
+ NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"
+fi
+
+NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR")
+NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE")
+
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH")
+fi
+
+# figure out which class to run
+if [ "$COMMAND" = "crawl" ] ; then
+ echo "Command $COMMAND is deprecated, please use bin/crawl instead"
+ exit -1
+elif [ "$COMMAND" = "inject" ] ; then
+ CLASS=org.apache.nutch.crawl.Injector
+elif [ "$COMMAND" = "generate" ] ; then
+ CLASS=org.apache.nutch.crawl.Generator
+elif [ "$COMMAND" = "freegen" ] ; then
+ CLASS=org.apache.nutch.tools.FreeGenerator
+elif [ "$COMMAND" = "fetch" ] ; then
+ CLASS=org.apache.nutch.fetcher.Fetcher
+elif [ "$COMMAND" = "parse" ] ; then
+ CLASS=org.apache.nutch.parse.ParseSegment
+elif [ "$COMMAND" = "readdb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDbReader
+elif [ "$COMMAND" = "mergedb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDbMerger
+elif [ "$COMMAND" = "readlinkdb" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDbReader
+elif [ "$COMMAND" = "readseg" ] ; then
+ CLASS=org.apache.nutch.segment.SegmentReader
+elif [ "$COMMAND" = "mergesegs" ] ; then
+ CLASS=org.apache.nutch.segment.SegmentMerger
+elif [ "$COMMAND" = "updatedb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDb
+elif [ "$COMMAND" = "invertlinks" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDb
+elif [ "$COMMAND" = "mergelinkdb" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDbMerger
+elif [ "$COMMAND" = "dump" ] ; then
+ CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+ CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
+elif [ "$COMMAND" = "solrindex" ] ; then
+ CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
+ shift
+elif [ "$COMMAND" = "index" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexingJob
+elif [ "$COMMAND" = "solrdedup" ] ; then
+ echo "Command $COMMAND is deprecated, please use dedup instead"
+ exit -1
+elif [ "$COMMAND" = "dedup" ] ; then
+ CLASS=org.apache.nutch.crawl.DeduplicationJob
+elif [ "$COMMAND" = "solrclean" ] ; then
+ CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
+ shift; shift
+elif [ "$COMMAND" = "clean" ] ; then
+ CLASS=org.apache.nutch.indexer.CleaningJob
+elif [ "$COMMAND" = "parsechecker" ] ; then
+ CLASS=org.apache.nutch.parse.ParserChecker
+elif [ "$COMMAND" = "indexchecker" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
+elif [ "$COMMAND" = "filterchecker" ] ; then
+ CLASS=org.apache.nutch.net.URLFilterChecker
+elif [ "$COMMAND" = "normalizerchecker" ] ; then
+ CLASS=org.apache.nutch.net.URLNormalizerChecker
+elif [ "$COMMAND" = "domainstats" ] ; then
+ CLASS=org.apache.nutch.util.domain.DomainStatistics
+elif [ "$COMMAND" = "protocolstats" ] ; then
+ CLASS=org.apache.nutch.util.ProtocolStatusStatistics
+elif [ "$COMMAND" = "crawlcomplete" ] ; then
+ CLASS=org.apache.nutch.util.CrawlCompletionStats
+elif [ "$COMMAND" = "webgraph" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.WebGraph
+elif [ "$COMMAND" = "linkrank" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.LinkRank
+elif [ "$COMMAND" = "scoreupdater" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater
+elif [ "$COMMAND" = "nodedumper" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.NodeDumper
+elif [ "$COMMAND" = "plugin" ] ; then
+ CLASS=org.apache.nutch.plugin.PluginRepository
+elif [ "$COMMAND" = "junit" ] ; then
+ CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
+ if $local; then
+ for f in "$NUTCH_HOME"/test/lib/*.jar; do
+ CLASSPATH="${CLASSPATH}:$f";
+ done
+ fi
+ CLASS=org.junit.runner.JUnitCore
+elif [ "$COMMAND" = "startserver" ] ; then
+ CLASS=org.apache.nutch.service.NutchServer
+elif [ "$COMMAND" = "webapp" ] ; then
+ CLASS=org.apache.nutch.webui.NutchUiServer
+elif [ "$COMMAND" = "warc" ] ; then
+ CLASS=org.apache.nutch.tools.warc.WARCExporter
+elif [ "$COMMAND" = "updatehostdb" ] ; then
+ CLASS=org.apache.nutch.hostdb.UpdateHostDb
+elif [ "$COMMAND" = "readhostdb" ] ; then
+ CLASS=org.apache.nutch.hostdb.ReadHostDb
+else
+ CLASS=$COMMAND
+fi
+
+# distributed mode
+EXEC_CALL=(hadoop jar "$NUTCH_JOB")
+
+if $local; then
+ EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH")
+else
+ # check that hadoop can be found on the path
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+ echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+ exit -1;
+ fi
+fi
+
+# run it
+exec "${EXEC_CALL[@]}" $CLASS "$@"
+
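As a quick sanity check of the script above, the sketch below (crawldb path hypothetical) exercises both a built-in command and the CLASSNAME fallback; per the command table, both lines resolve to the same class:

    # sketch: print crawldb statistics via the built-in command ...
    bin/nutch readdb crawl/crawldb -stats
    # ... or equivalently via the CLASSNAME fallback
    bin/nutch org.apache.nutch.crawl.CrawlDbReader crawl/crawldb -stats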
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/nutch-core/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml
index e358f50..ad5c3af 100644
--- a/nutch-core/pom.xml
+++ b/nutch-core/pom.xml
@@ -113,7 +113,8 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<slf4j.version>1.7.12</slf4j.version>
<junit.version>4.12</junit.version>
- <libs.dir>${project.parent.basedir}${file.separator}${libs.subdir}</libs.dir>
+ <dir.root>${project.parent.basedir}</dir.root>
+ <libs.dir>${dir.local}${file.separator}lib</libs.dir>
</properties>
<dependencies>
@@ -468,7 +469,32 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>3.0.1</version>
+ <executions>
+ <execution>
+ <id>copy-resources</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${libs.dir}</outputDirectory>
+ <resources>
+ <resource>
+ <directory>${project.build.directory}</directory>
+ <include>${build.finalName}.jar</include>
+ </resource>
+ <resource>
+ <directory>${project.basedir}</directory>
+ <include>plugin.xml</include>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
</plugins>
</build>
-
</project>
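Given the property wiring above (libs.dir resolving to runtime/local/lib under the project root), packaging should copy the nutch-core jar into the local runtime; a minimal sketch to verify, assuming a build from the project root:

    mvn clean package -DskipTests
    ls runtime/local/lib    # the nutch-core jar should now be present here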
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/nutch-plugins/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml
index e6a6abd..fa7adb7 100644
--- a/nutch-plugins/pom.xml
+++ b/nutch-plugins/pom.xml
@@ -32,6 +32,7 @@
<url>http://nutch.apache.org</url>
<modules>
+ <!--<module>indexer-solr</module>-->
<module>creativecommons</module>
<module>feed</module>
<module>headings</module>
@@ -101,7 +102,9 @@
</modules>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <libs.dir>..${file.separator}..${file.separator}${libs.subdir}</libs.dir> <!-- Note : one additional level is for the child modules-->
+ <!-- Note: the additional level accounts for the child modules nested one level below this POM -->
+ <dir.root>..${file.separator}..${file.separator}</dir.root>
+ <libs.dir>${dir.local.plugins}${file.separator}${project.artifactId}</libs.dir>
</properties>
<dependencies>
<dependency>
@@ -118,5 +121,35 @@
<type>test-jar</type>
</dependency>
</dependencies>
-
+ <build>
+ <finalName>${project.artifactId}</finalName>
+ <plugins>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>3.0.1</version>
+ <executions>
+ <execution>
+ <id>copy-resources</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${libs.dir}</outputDirectory>
+ <resources>
+ <resource>
+ <directory>${project.build.directory}</directory>
+ <include>${build.finalName}.jar</include>
+ </resource>
+ <resource>
+ <directory>${project.basedir}</directory>
+ <include>plugin.xml</include>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
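With libs.dir pointing at ${dir.local.plugins}${file.separator}${project.artifactId}, each plugin module's jar (named after its artifactId via finalName) and its plugin.xml should land in a per-plugin directory under runtime/local/plugins. A sketch for a single module, with the module path assumed:

    mvn package -DskipTests -pl nutch-plugins/feed -am
    ls runtime/local/plugins/feed    # expect feed.jar and plugin.xml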
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 268ab2d..18e22c7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -10,8 +10,10 @@
<packaging>pom</packaging>
<properties>
- <libs.subdir>runtime${file.separator}local${file.separator}lib</libs.subdir>
- <libs.dir>${project.basedir}${file.separator}${libs.subdir}</libs.dir>
+ <dir.root>${project.basedir}</dir.root>
+ <dir.local>${dir.root}${file.separator}runtime${file.separator}local</dir.local>
+ <dir.local.libs>${dir.local}${file.separator}libs</dir.local.libs>
+ <dir.local.plugins>${dir.local}${file.separator}plugins</dir.local.plugins>
<junit.version>4.12</junit.version>
</properties>
<modules>
@@ -37,6 +39,14 @@
<directory>runtime</directory>
<followSymlinks>false</followSymlinks>
</fileset>
+ <fileset>
+ <directory>${dir.local.libs}</directory>
+ <includes>
+ <include>**/*.jar</include>
+ <include>**/*.xml</include>
+ </includes>
+ <followSymlinks>false</followSymlinks>
+ </fileset>
</filesets>
</configuration>
</plugin>
@@ -61,6 +71,43 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>3.0.1</version>
+ <executions>
+ <execution>
+ <id>copy-scripts</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${dir.local}${file.separator}bin</outputDirectory>
+ <resources>
+ <resource>
+ <directory>bin</directory>
+ <!-- This plugin doesn't preserve permissions, so the scripts aren't executable-->
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ <execution>
+ <id>copy-conf</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${dir.local}${file.separator}conf</outputDirectory>
+ <resources>
+ <resource>
+ <directory>conf</directory>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
</plugins>
</build>
<dependencies>
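Because maven-resources-plugin does not preserve file permissions (as the inline comment above notes), the copied scripts need their execute bit restored before use; a minimal sketch, assuming a build from the project root:

    mvn clean package -DskipTests
    chmod +x runtime/local/bin/crawl runtime/local/bin/nutch
    runtime/local/bin/nutch    # should print the usage message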
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/src/bin/crawl
----------------------------------------------------------------------
diff --git a/src/bin/crawl b/src/bin/crawl
deleted file mode 100755
index 567d35e..0000000
--- a/src/bin/crawl
+++ /dev/null
@@ -1,281 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
-# -i|--index Indexes crawl results into a configured indexer
-# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
-# are scheduled for fetching. Suffix can be: s for second,
-# m for minute, h for hour and d for day. If no suffix is
-# specified, seconds are used by default.
-# -D A Java property to pass to Nutch calls
-# Seed Dir Directory in which to look for a seeds file
-# Crawl Dir Directory where the crawl/link/segments dirs are saved
-# Num Rounds The number of rounds to run this crawl for
-#
-#
-# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
-# INDEXING FOR EACH SEGMENT
-
-INDEXFLAG=false
-JAVA_PROPERTIES=""
-WAIT=-1 # don't wait if there are no URLs to fetch
-
-function __to_seconds() {
- NUMBER=$(echo $1 | tr -dc '0-9')
- MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')
-
- case $MODIFIER in
- m|M)
- SECONDS=`expr $NUMBER \* 60`
- ;;
- h|H)
- SECONDS=`expr $NUMBER \* 3600`
- ;;
- d|D)
- SECONDS=`expr $NUMBER \* 86400`
- ;;
- s|S|*)
- SECONDS=$NUMBER
- ;;
- esac
-
- echo $SECONDS
-}
-
-while [[ $# -gt 0 ]]
-do
- case $1 in
- -i|--index)
- INDEXFLAG=true
- shift
- ;;
- -D)
- JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
- shift 2
- ;;
- -w|--wait)
- WAIT="${2}"
- shift 2
- ;;
- *)
- break
- ;;
- esac
-done
-
-if [[ $# != 3 ]]; then
- echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
- echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
- echo -e "\t-D\t\tA Java property to pass to Nutch calls"
- echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
- echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
- echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
- echo -e "\t\t\tspecified second is used by default."
- echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
- echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
- echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
- exit 1
-fi
-
-SEEDDIR="$1"
-CRAWL_PATH="$2"
-LIMIT="$3"
-
-# convert wait time to seconds for compatibility reasons
-if [ "$WAIT" != "-1" ]; then
- WAIT=$( __to_seconds "$WAIT" )
- echo "Time to wait (--wait) = $WAIT sec."
-fi
-
-#############################################
-# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
-#############################################
-
-# set the number of slave nodes
-numSlaves=1
-
-# and the total number of available tasks
-# sets Hadoop parameter "mapreduce.job.reduces"
-numTasks=`expr $numSlaves \* 2`
-
-# number of urls to fetch in one iteration
-# 250K per task?
-sizeFetchlist=`expr $numSlaves \* 50000`
-
-# time limit for fetching (minutes)
-timeLimitFetch=180
-
-# num threads for fetching
-numThreads=50
-
-#############################################
-
-bin="`dirname "$0"`"
-bin="`cd "$bin"; pwd`"
-
-# determine the mode based on the presence of a job file
-mode=local
-if [ -f "${bin}"/../*nutch*.job ]; then
- mode=distributed
-fi
-
-# note that some of the options listed here could be set in the
-# corresponding hadoop site xml param file
-commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
-
-# check that hadoop can be found on the path
-if [ $mode = "distributed" ]; then
- if [ $(which hadoop | wc -l ) -eq 0 ]; then
- echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
- exit -1;
- fi
-fi
-
-
-function __bin_nutch {
- # run $bin/nutch, exit if exit value indicates error
-
- echo "$bin/nutch $@" ;# echo command and arguments
- "$bin/nutch" "$@"
-
- RETCODE=$?
- if [ $RETCODE -ne 0 ]
- then
- echo "Error running:"
- echo " $bin/nutch $@"
- echo "Failed with exit value $RETCODE."
- exit $RETCODE
- fi
-}
-
-
-
-# initial injection
-echo "Injecting seed URLs"
-__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
-
-# main loop : rounds of generate - fetch - parse - update
-for ((a=1; ; a++))
-do
- if [ -e ".STOP" ]
- then
- echo "STOP file found - escaping loop"
- break
- fi
-
- if [ $LIMIT -ne -1 ]; then
- if [ $a -gt $LIMIT ]; then
- echo `date` ": Finished loop with $LIMIT iterations"
- break
- fi
- echo `date` ": Iteration $a of $LIMIT"
- else
- echo `date` ": Iteration $a"
- fi
-
- echo "Generating a new segment"
- generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
- echo "$bin/nutch generate ${generate_args[@]}"
- $bin/nutch generate "${generate_args[@]}"
- RETCODE=$?
- if [ $RETCODE -eq 0 ]; then
- : # ok: no error
- elif [ $RETCODE -eq 1 ]; then
- echo "Generate returned 1 (no new segments created)"
-
- if [ "$WAIT" -ne -1 ]; then
- echo "Waiting for $WAIT sec. ..."
- sleep $WAIT
- continue
- else
- echo "Escaping loop: no more URLs to fetch now"
- break
- fi
- else
- echo "Error running:"
- echo " $bin/nutch generate ${generate_args[@]}"
- echo "Failed with exit value $RETCODE."
- exit $RETCODE
- fi
-
- # capture the name of the segment
- # call hadoop in distributed mode
- # or use ls
-
- if [ $mode = "local" ]; then
- SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
- else
- SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
- fi
-
- echo "Operating on segment : $SEGMENT"
-
- # fetching the segment
- echo "Fetching : $SEGMENT"
- __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
-
- # parsing the segment
- echo "Parsing : $SEGMENT"
- # enable skipping of records during parsing so that a dodgy document
- # does not fail the whole task
- skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
- __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
-
- # updatedb with this segment
- echo "CrawlDB update"
- __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
-
-# note that the link inversion - indexing routine can be done within the main loop
-# on a per segment basis
- echo "Link inversion"
- __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-
- echo "Dedup on crawldb"
- __bin_nutch dedup "$CRAWL_PATH"/crawldb
-
- if $INDEXFLAG; then
- echo "Indexing $SEGMENT to index"
- __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-
- echo "Cleaning up index if possible"
- __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
- else
- echo "Skipping indexing ..."
- fi
-
- #######################################################
- # The following commands fall into WebGraph territory
- # and should be uncommented based on your requirements
- #######################################################
- #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
- #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
-
- #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
- #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
-
- #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
- #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
-
- #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
- #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
-
- #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
- #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
-
-done
-
-exit 0
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/src/bin/nutch
----------------------------------------------------------------------
diff --git a/src/bin/nutch b/src/bin/nutch
deleted file mode 100755
index 1649069..0000000
--- a/src/bin/nutch
+++ /dev/null
@@ -1,324 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# The Nutch command script
-#
-# Environment Variables
-#
-# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
-#
-# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB.
-# Default is 1000.
-#
-# NUTCH_OPTS Extra Java runtime options.
-# Multiple options must be separated by white space.
-#
-# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs)
-#
-# NUTCH_LOGFILE Log file (default: hadoop.log)
-#
-# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf).
-# Multiple paths must be separated by a colon ':'.
-#
-cygwin=false
-case "`uname`" in
-CYGWIN*) cygwin=true;;
-esac
-
-# resolve links - $0 may be a softlink
-THIS="$0"
-while [ -h "$THIS" ]; do
- ls=`ls -ld "$THIS"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '.*/.*' > /dev/null; then
- THIS="$link"
- else
- THIS=`dirname "$THIS"`/"$link"
- fi
-done
-
-# if no args specified, show usage
-if [ $# = 0 ]; then
- echo "nutch 1.12"
- echo "Usage: nutch COMMAND"
- echo "where COMMAND is one of:"
- echo " readdb read / dump crawl db"
- echo " mergedb merge crawldb-s, with optional filtering"
- echo " readlinkdb read / dump link db"
- echo " inject inject new urls into the database"
- echo " generate generate new segments to fetch from crawl db"
- echo " freegen generate new segments to fetch from text files"
- echo " fetch fetch a segment's pages"
- echo " parse parse a segment's pages"
- echo " readseg read / dump segment data"
- echo " mergesegs merge several segments, with optional filtering and slicing"
- echo " updatedb update crawl db from segments after fetching"
- echo " invertlinks create a linkdb from parsed segments"
- echo " mergelinkdb merge linkdb-s, with optional filtering"
- echo " index run the plugin-based indexer on parsed segments and linkdb"
- echo " dedup deduplicate entries in the crawldb and give them a special status"
- echo " dump exports crawled data from segments into files"
- echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
- echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
- echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead"
- echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
- echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
- echo " parsechecker check the parser for a given url"
- echo " indexchecker check the indexing filters for a given url"
- echo " filterchecker check url filters for a given url"
- echo " normalizerchecker check url normalizers for a given url"
- echo " domainstats calculate domain statistics from crawldb"
- echo " protocolstats calculate protocol status code stats from crawldb"
- echo " crawlcomplete calculate crawl completion stats from crawldb"
- echo " webgraph generate a web graph from existing segments"
- echo " linkrank run a link analysis program on the generated web graph"
- echo " scoreupdater updates the crawldb with linkrank scores"
- echo " nodedumper dumps the web graph's node scores"
- echo " plugin load a plugin and run one of its classes main()"
- echo " junit runs the given JUnit test"
- echo " startserver runs the Nutch Server on localhost:8081"
- echo " webapp run a local Nutch Web Application on locahost:8080"
- echo " warc exports crawled data from segments at the WARC format"
- echo " updatehostdb update the host db with records from the crawl db"
- echo " readhostdb read / dump host db"
- echo " or"
- echo " CLASSNAME run the class named CLASSNAME"
- echo "Most commands print help when invoked w/o parameters."
- exit 1
-fi
-
-# get arguments
-COMMAND=$1
-shift
-
-# some directories
-THIS_DIR="`dirname "$THIS"`"
-NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"
-
-# some Java parameters
-if [ "$NUTCH_JAVA_HOME" != "" ]; then
- #echo "run java in $NUTCH_JAVA_HOME"
- JAVA_HOME="$NUTCH_JAVA_HOME"
-fi
-
-if [ "$JAVA_HOME" = "" ]; then
- echo "Error: JAVA_HOME is not set."
- exit 1
-fi
-
-local=true
-
-# NUTCH_JOB
-if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then
- local=false
- for f in "$NUTCH_HOME"/*nutch*.job; do
- NUTCH_JOB="$f"
- done
- # cygwin path translation
- if $cygwin; then
- NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"
- fi
-fi
-
-JAVA="$JAVA_HOME/bin/java"
-JAVA_HEAP_MAX=-Xmx1000m
-
-# check envvars which might override default args
-if [ "$NUTCH_HEAPSIZE" != "" ]; then
- #echo "run with heapsize $NUTCH_HEAPSIZE"
- JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
- #echo $JAVA_HEAP_MAX
-fi
-
-# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
-CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
-CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"
-
-# so that filenames w/ spaces are handled correctly in loops below
-IFS=
-
-# add libs to CLASSPATH
-if $local; then
- for f in "$NUTCH_HOME"/lib/*.jar; do
- CLASSPATH="${CLASSPATH}:$f";
- done
- # local runtime
- # add plugins to classpath
- if [ -d "$NUTCH_HOME/plugins" ]; then
- CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
- fi
-fi
-
-# cygwin path translation
-if $cygwin; then
- CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
-fi
-
-# setup 'java.library.path' for native-hadoop code if necessary
-# used only in local mode
-JAVA_LIBRARY_PATH=''
-if [ -d "${NUTCH_HOME}/lib/native" ]; then
-
- JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`
-
- if [ -d "${NUTCH_HOME}/lib/native" ]; then
- if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
- JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
- else
- JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
- fi
- fi
-fi
-
-if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
- JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`"
-fi
-
-# restore ordinary behaviour
-unset IFS
-
-# default log directory & file
-if [ "$NUTCH_LOG_DIR" = "" ]; then
- NUTCH_LOG_DIR="$NUTCH_HOME/logs"
-fi
-if [ "$NUTCH_LOGFILE" = "" ]; then
- NUTCH_LOGFILE='hadoop.log'
-fi
-
-#Fix log path under cygwin
-if $cygwin; then
- NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"
-fi
-
-NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR")
-NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE")
-
-if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
- NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH")
-fi
-
-# figure out which class to run
-if [ "$COMMAND" = "crawl" ] ; then
- echo "Command $COMMAND is deprecated, please use bin/crawl instead"
- exit -1
-elif [ "$COMMAND" = "inject" ] ; then
- CLASS=org.apache.nutch.crawl.Injector
-elif [ "$COMMAND" = "generate" ] ; then
- CLASS=org.apache.nutch.crawl.Generator
-elif [ "$COMMAND" = "freegen" ] ; then
- CLASS=org.apache.nutch.tools.FreeGenerator
-elif [ "$COMMAND" = "fetch" ] ; then
- CLASS=org.apache.nutch.fetcher.Fetcher
-elif [ "$COMMAND" = "parse" ] ; then
- CLASS=org.apache.nutch.parse.ParseSegment
-elif [ "$COMMAND" = "readdb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDbReader
-elif [ "$COMMAND" = "mergedb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDbMerger
-elif [ "$COMMAND" = "readlinkdb" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDbReader
-elif [ "$COMMAND" = "readseg" ] ; then
- CLASS=org.apache.nutch.segment.SegmentReader
-elif [ "$COMMAND" = "mergesegs" ] ; then
- CLASS=org.apache.nutch.segment.SegmentMerger
-elif [ "$COMMAND" = "updatedb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDb
-elif [ "$COMMAND" = "invertlinks" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDb
-elif [ "$COMMAND" = "mergelinkdb" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDbMerger
-elif [ "$COMMAND" = "dump" ] ; then
- CLASS=org.apache.nutch.tools.FileDumper
-elif [ "$COMMAND" = "commoncrawldump" ] ; then
- CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
-elif [ "$COMMAND" = "solrindex" ] ; then
- CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
- shift
-elif [ "$COMMAND" = "index" ] ; then
- CLASS=org.apache.nutch.indexer.IndexingJob
-elif [ "$COMMAND" = "solrdedup" ] ; then
- echo "Command $COMMAND is deprecated, please use dedup instead"
- exit -1
-elif [ "$COMMAND" = "dedup" ] ; then
- CLASS=org.apache.nutch.crawl.DeduplicationJob
-elif [ "$COMMAND" = "solrclean" ] ; then
- CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
- shift; shift
-elif [ "$COMMAND" = "clean" ] ; then
- CLASS=org.apache.nutch.indexer.CleaningJob
-elif [ "$COMMAND" = "parsechecker" ] ; then
- CLASS=org.apache.nutch.parse.ParserChecker
-elif [ "$COMMAND" = "indexchecker" ] ; then
- CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
-elif [ "$COMMAND" = "filterchecker" ] ; then
- CLASS=org.apache.nutch.net.URLFilterChecker
-elif [ "$COMMAND" = "normalizerchecker" ] ; then
- CLASS=org.apache.nutch.net.URLNormalizerChecker
-elif [ "$COMMAND" = "domainstats" ] ; then
- CLASS=org.apache.nutch.util.domain.DomainStatistics
-elif [ "$COMMAND" = "protocolstats" ] ; then
- CLASS=org.apache.nutch.util.ProtocolStatusStatistics
-elif [ "$COMMAND" = "crawlcomplete" ] ; then
- CLASS=org.apache.nutch.util.CrawlCompletionStats
-elif [ "$COMMAND" = "webgraph" ] ; then
- CLASS=org.apache.nutch.scoring.webgraph.WebGraph
-elif [ "$COMMAND" = "linkrank" ] ; then
- CLASS=org.apache.nutch.scoring.webgraph.LinkRank
-elif [ "$COMMAND" = "scoreupdater" ] ; then
- CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater
-elif [ "$COMMAND" = "nodedumper" ] ; then
- CLASS=org.apache.nutch.scoring.webgraph.NodeDumper
-elif [ "$COMMAND" = "plugin" ] ; then
- CLASS=org.apache.nutch.plugin.PluginRepository
-elif [ "$COMMAND" = "junit" ] ; then
- CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
- if $local; then
- for f in "$NUTCH_HOME"/test/lib/*.jar; do
- CLASSPATH="${CLASSPATH}:$f";
- done
- fi
- CLASS=org.junit.runner.JUnitCore
-elif [ "$COMMAND" = "startserver" ] ; then
- CLASS=org.apache.nutch.service.NutchServer
-elif [ "$COMMAND" = "webapp" ] ; then
- CLASS=org.apache.nutch.webui.NutchUiServer
-elif [ "$COMMAND" = "warc" ] ; then
- CLASS=org.apache.nutch.tools.warc.WARCExporter
-elif [ "$COMMAND" = "updatehostdb" ] ; then
- CLASS=org.apache.nutch.hostdb.UpdateHostDb
-elif [ "$COMMAND" = "readhostdb" ] ; then
- CLASS=org.apache.nutch.hostdb.ReadHostDb
-else
- CLASS=$COMMAND
-fi
-
-# distributed mode
-EXEC_CALL=(hadoop jar "$NUTCH_JOB")
-
-if $local; then
- EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH")
-else
- # check that hadoop can be found on the path
- if [ $(which hadoop | wc -l ) -eq 0 ]; then
- echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
- exit -1;
- fi
-fi
-
-# run it
-exec "${EXEC_CALL[@]}" $CLASS "$@"
-
[2/2] nutch git commit: Convert tests which require the plugin.folders system prop to integration tests
Posted by th...@apache.org.
Convert tests which require the plugin.folders system prop to integration tests
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/9f3ba3ed
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/9f3ba3ed
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/9f3ba3ed
Branch: refs/heads/NUTCH-2292
Commit: 9f3ba3eda59219eabe7020f2c65b505dbc46d947
Parents: 020f581
Author: Thamme Gowda <th...@apache.org>
Authored: Sun Jul 10 19:07:57 2016 -0700
Committer: Thamme Gowda <th...@apache.org>
Committed: Sun Jul 10 19:07:57 2016 -0700
----------------------------------------------------------------------
nutch-core/pom.xml | 22 ++++++++++++
.../nutch/crawl/TODOTestCrawlDbStates.java | 3 ++
.../apache/nutch/crawl/TestCrawlDbFilter.java | 3 ++
.../apache/nutch/crawl/TestCrawlDbMerger.java | 3 ++
.../apache/nutch/crawl/TestCrawlDbStates.java | 3 ++
.../org/apache/nutch/crawl/TestGenerator.java | 3 ++
.../org/apache/nutch/crawl/TestInjector.java | 3 ++
.../org/apache/nutch/fetcher/TestFetcher.java | 3 ++
.../nutch/indexer/TestIndexerMapReduce.java | 3 ++
.../nutch/indexer/TestIndexingFilters.java | 3 ++
.../org/apache/nutch/net/TestURLFilters.java | 3 ++
.../apache/nutch/net/TestURLNormalizers.java | 3 ++
.../apache/nutch/parse/TestParserFactory.java | 3 ++
.../apache/nutch/plugin/TestPluginSystem.java | 3 ++
.../nutch/protocol/TestProtocolFactory.java | 3 ++
.../nutch/tools/TestCommonCrawlDataDumper.java | 5 +--
.../org/apache/nutch/util/TestMimeUtil.java | 12 +++++--
nutch-plugins/pom.xml | 9 +++++
pom.xml | 36 ++++++++++++++++++++
19 files changed, 122 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml
index ad5c3af..62e2e58 100644
--- a/nutch-core/pom.xml
+++ b/nutch-core/pom.xml
@@ -456,6 +456,11 @@
</dependencies>
<build>
+ <resources>
+ <resource>
+ <directory>${project.parent.basedir}${file.separator}conf</directory>
+ </resource>
+ </resources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -495,6 +500,23 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <excludedGroups>org.apache.nutch.test.IntegrationTest</excludedGroups>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <systemPropertyVariables>
+ <plugin.folders>../runtime/local/plugins</plugin.folders>
+ </systemPropertyVariables>
+ </configuration>
+ </plugin>
</plugins>
</build>
</project>
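With surefire excluding the IntegrationTest category and failsafe supplying plugin.folders, the intended split would be exercised as sketched below, assuming failsafe is also configured to pick up the categorized tests (the root pom changes are not shown in full here):

    mvn test      # surefire: unit tests only, IntegrationTest category excluded
    mvn verify    # failsafe: integration tests run with plugin.folders=../runtime/local/plugins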
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
index e44cb39..fd88c7d 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
@@ -3,14 +3,17 @@ package org.apache.nutch.crawl;
import static org.apache.nutch.crawl.CrawlDatum.*;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.TimingUtil;
import static org.junit.Assert.*;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+@Category({ IntegrationTest.class})
public class TODOTestCrawlDbStates extends TestCrawlDbStates {
private static final Logger LOG = LoggerFactory
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
index 38c38ed..773dd29 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -28,11 +28,13 @@ import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* CrawlDbFiltering test which tests for correct, error free url normalization
@@ -73,6 +75,7 @@ public class TestCrawlDbFilter {
* @throws Exception
*/
@Test
+ @Category({IntegrationTest.class})
public void testUrl404Purging() throws Exception {
// create a CrawlDatum with DB GONE status
ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
index b670551..599c353 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@ -30,11 +30,13 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
public class TestCrawlDbMerger {
private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class
@@ -110,6 +112,7 @@ public class TestCrawlDbMerger {
* @throws Exception
*/
@Test
+ @Category({IntegrationTest.class})
public void testMerge() throws Exception {
Path crawldb1 = new Path(testDir, "crawldb1");
Path crawldb2 = new Path(testDir, "crawldb2");
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
index c54559b..b631319 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -35,8 +35,10 @@ import org.apache.nutch.scoring.ScoringFilters;
import static org.junit.Assert.*;
+import org.apache.nutch.test.IntegrationTest;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -57,6 +59,7 @@ import org.slf4j.LoggerFactory;
* </ul>
* </li> </ul>
*/
+@Category({IntegrationTest.class})
public class TestCrawlDbStates {
private static final Logger LOG = LoggerFactory
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
index 84e6b28..0ce3c5f 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
@@ -28,10 +28,12 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* Basic generator test. 1. Insert entries in crawldb 2. Generates entries to
@@ -39,6 +41,7 @@ import org.junit.Test;
* highest scoring urls are generated
*
*/
+@Category({IntegrationTest.class})
public class TestGenerator {
Configuration conf;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
index 7293cbb..59a3e8c 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
@@ -29,10 +29,12 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.nutch.test.IntegrationTest;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* Basic injector test: 1. Creates a text file with urls 2. Injects them into
@@ -40,6 +42,7 @@ import org.junit.Test;
* into webdb 5. Reads crawldb entries and verifies contents
*
*/
+@Category({IntegrationTest.class})
public class TestInjector {
private Configuration conf;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
index fae5f90..a23d080 100644
--- a/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
+++ b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
@@ -32,10 +32,12 @@ import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
import org.mortbay.jetty.Server;
/**
@@ -79,6 +81,7 @@ public class TestFetcher {
}
@Test
+ @Category(IntegrationTest.class)
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
// generate seedlist
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
index d581a0f..3a25f26 100644
--- a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
@@ -30,8 +30,10 @@ import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -99,6 +101,7 @@ public class TestIndexerMapReduce {
* Test indexing of base64-encoded binary content.
*/
@Test
+ @Category(IntegrationTest.class)
public void testBinaryContentBase64() {
configuration = NutchConfiguration.create();
configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
index 4d5849f..14b246b 100644
--- a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
@@ -25,10 +25,13 @@ import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
+@Category(IntegrationTest.class)
public class TestIndexingFilters {
/**
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
index c43941a..ef07907 100644
--- a/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
+++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
@@ -17,9 +17,12 @@
package org.apache.nutch.net;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
+@Category(IntegrationTest.class)
public class TestURLFilters {
/**
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
index 6fdbb9d..d29e9d3 100644
--- a/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
+++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
@@ -19,10 +19,13 @@ package org.apache.nutch.net;
import java.net.MalformedURLException;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
+@Category(IntegrationTest.class)
public class TestURLNormalizers {
@Test
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
index 00c524e..198e284 100644
--- a/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
+++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
@@ -20,10 +20,12 @@ package org.apache.nutch.parse;
// Nutch imports
import org.apache.nutch.plugin.Extension;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* Unit test for new parse plugin selection.
@@ -31,6 +33,7 @@ import org.junit.Test;
* @author Sebastien Le Callonnec
* @version 1.0
*/
+@Category(IntegrationTest.class)
public class TestParserFactory {
private Configuration conf;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
index a5f4e32..7bcc9ab 100644
--- a/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
+++ b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
@@ -28,16 +28,19 @@ import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* Unit tests for the plugin system
*/
+@Category(IntegrationTest.class)
public class TestPluginSystem {
private int fPluginCount;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
index 394c303..6b4c8fd 100644
--- a/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
+++ b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
@@ -17,12 +17,15 @@
package org.apache.nutch.protocol;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.ObjectCache;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
+@Category(IntegrationTest.class)
public class TestProtocolFactory {
Configuration conf;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
index 1429925..fef0e69 100644
--- a/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
@@ -19,6 +19,8 @@ package org.apache.nutch.tools;
//Junit imports
import static org.junit.Assert.*;
+
+import org.apache.nutch.test.TestUtils;
import org.junit.Test;
//Commons imports
@@ -43,8 +45,7 @@ public class TestCommonCrawlDataDumper {
@Test
public void testDump() throws Exception {
- File sampleSegmentDir = new File(System.getProperty("test.build.data",
- "."), "test-segments");
+ File sampleSegmentDir = TestUtils.getFile(this, "test-segments");
File tempDir = Files.createTempDirectory("temp").toFile();
String[] crawledFiles = {
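TestUtils.getFile is also new in this change, and its source is not in this half of the diff. Judging from the call sites (a test instance plus a resource name, throwing FileNotFoundException), it presumably resolves test data through the classpath rather than the ant-era test.build.data system property, which has no counterpart in the Maven build. A rough sketch of such a helper, with the actual implementation unconfirmed:

    package org.apache.nutch.test;

    import java.io.File;
    import java.io.FileNotFoundException;
    import java.net.URL;

    public class TestUtils {

      /** Resolves a named test resource via the caller's classloader. */
      public static File getFile(Object caller, String name)
          throws FileNotFoundException {
        URL url = caller.getClass().getClassLoader().getResource(name);
        if (url == null) {
          throw new FileNotFoundException(name + " is not on the classpath of "
              + caller.getClass().getName());
        }
        return new File(url.getFile());
      }
    }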
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
index d0b45db..d812110 100644
--- a/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
@@ -18,6 +18,7 @@
package org.apache.nutch.util;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.Charset;
@@ -26,6 +27,7 @@ import org.apache.hadoop.conf.Configuration;
import com.google.common.io.Files;
import junit.framework.TestCase;
+import org.apache.nutch.test.TestUtils;
public class TestMimeUtil extends TestCase {
@@ -33,8 +35,14 @@ public class TestMimeUtil extends TestCase {
private static Charset defaultCharset = Charset.forName("UTF-8");
- private File sampleDir = new File(System.getProperty("test.build.data", "."),
- "test-mime-util");
+ private File sampleDir;
+ {
+ try {
+ sampleDir = TestUtils.getFile(this, "test-mime-util");
+ } catch (FileNotFoundException e){
+ throw new RuntimeException(e);
+ }
+ }
/**
* test data, every element on "test page":
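The instance initializer block above exists because a plain field initializer cannot throw the checked FileNotFoundException that TestUtils.getFile declares. Since TestMimeUtil is a JUnit 3 style TestCase, a hypothetical alternative would be to defer the lookup to setUp(), so a missing resource surfaces as an ordinary test error instead of a RuntimeException at construction time:

    private File sampleDir;

    @Override
    protected void setUp() throws Exception {
      super.setUp();
      // Resolve the sample data once per test; setUp() may throw checked
      // exceptions, so no wrapping is needed here.
      sampleDir = TestUtils.getFile(this, "test-mime-util");
    }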
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-plugins/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml
index fa7adb7..e07f487 100644
--- a/nutch-plugins/pom.xml
+++ b/nutch-plugins/pom.xml
@@ -150,6 +150,15 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.19.1</version>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.19.1</version>
+ </plugin>
</plugins>
</build>
</project>
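nutch-plugins declares the two plugins above with a version but no configuration of its own; the shared settings come from the <pluginManagement> section added to the parent pom.xml just below. With that wiring, mvn test runs only Surefire, which skips everything tagged IntegrationTest, while mvn verify additionally triggers the Failsafe execution bound to the integration-test and verify phases, which runs only the tagged group. The <include>**/*.java</include> override in the parent is what makes this work: by default Failsafe only picks up classes matching the IT*.java / *IT.java / *ITCase.java naming patterns, so widening the includes lets it scan every test class and filter purely by category.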
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 18e22c7..a3b9271 100644
--- a/pom.xml
+++ b/pom.xml
@@ -109,6 +109,42 @@
</executions>
</plugin>
</plugins>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+        <!-- Surefire is for unit tests; here we exclude the integration test group -->
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <excludedGroups>org.apache.nutch.test.IntegrationTest</excludedGroups>
+ </configuration>
+ </plugin>
+ <plugin>
+        <!-- Failsafe is for integration tests; here we run only the integration test group -->
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <systemPropertyVariables>
+ <plugin.folders>${dir.local.plugins}</plugin.folders>
+ </systemPropertyVariables>
+ <includes>
+ <include>**/*.java</include>
+ </includes>
+ <groups>org.apache.nutch.test.IntegrationTest</groups>
+ </configuration>
+ <executions>
+ <execution>
+ <id>integration-test</id>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </pluginManagement>
</build>
<dependencies>
<dependency>