Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/11 02:07:28 UTC
[1/2] nutch git commit: Reproduced runtime/local build without breaking backward compatibility
Repository: nutch
Updated Branches:
refs/heads/NUTCH-2292 9173fd4d6 -> 9f3ba3eda
Reproduced runtime/local build without breaking backward compatibility
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/020f581a
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/020f581a
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/020f581a
Branch: refs/heads/NUTCH-2292
Commit: 020f581a2cc735f26d6a423e87da7f7462ed3a35
Parents: 9173fd4
Author: Thamme Gowda <th...@apache.org>
Authored: Sun Jul 10 17:49:51 2016 -0700
Committer: Thamme Gowda <th...@apache.org>
Committed: Sun Jul 10 17:49:51 2016 -0700
----------------------------------------------------------------------
bin/crawl | 281 +++++++++++++++++++++++++++++++++++++++
bin/nutch | 324 +++++++++++++++++++++++++++++++++++++++++++++
nutch-core/pom.xml | 30 ++++-
nutch-plugins/pom.xml | 37 +++++-
pom.xml | 51 ++++++-
src/bin/crawl | 281 ---------------------------------------
src/bin/nutch | 324 ---------------------------------------------
7 files changed, 717 insertions(+), 611 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/bin/crawl
----------------------------------------------------------------------
diff --git a/bin/crawl b/bin/crawl
new file mode 100755
index 0000000..567d35e
--- /dev/null
+++ b/bin/crawl
@@ -0,0 +1,281 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
+# -i|--index Indexes crawl results into a configured indexer
+# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
+# are scheduled for fetching. Suffix can be: s for second,
+# m for minute, h for hour and d for day. If no suffix is
+# specified, seconds are used by default.
+# -D A Java property to pass to Nutch calls
+# Seed Dir Directory in which to look for a seeds file
+# Crawl Dir Directory where the crawl/link/segments dirs are saved
+# Num Rounds The number of rounds to run this crawl for
+#
+#
+# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
+# INDEXING FOR EACH SEGMENT
+
+INDEXFLAG=false
+JAVA_PROPERTIES=""
+WAIT=-1 # don't wait if there are no URLs to fetch
+
+function __to_seconds() {
+ NUMBER=$(echo $1 | tr -dc '0-9')
+ MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')
+
+ case $MODIFIER in
+ m|M)
+ SECONDS=`expr $NUMBER \* 60`
+ ;;
+ h|H)
+ SECONDS=`expr $NUMBER \* 3600`
+ ;;
+ d|D)
+ SECONDS=`expr $NUMBER \* 86400`
+ ;;
+ s|S|*)
+ SECONDS=$NUMBER
+ ;;
+ esac
+
+ echo $SECONDS
+}
+
+while [[ $# -gt 0 ]]
+do
+ case $1 in
+ -i|--index)
+ INDEXFLAG=true
+ shift
+ ;;
+ -D)
+ JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
+ shift 2
+ ;;
+ -w|--wait)
+ WAIT="${2}"
+ shift 2
+ ;;
+ *)
+ break
+ ;;
+ esac
+done
+
+if [[ $# != 3 ]]; then
+ echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
+ echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
+ echo -e "\t-D\t\tA Java property to pass to Nutch calls"
+ echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
+ echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
+ echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
+ echo -e "\t\t\tspecified second is used by default."
+ echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
+ echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
+ echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
+ exit 1
+fi
+
+SEEDDIR="$1"
+CRAWL_PATH="$2"
+LIMIT="$3"
+
+# convert wait time to seconds for compatibility reasons
+if [ "$WAIT" != "-1" ]; then
+ WAIT=$( __to_seconds "$WAIT" )
+ echo "Time to wait (--wait) = $WAIT sec."
+fi
+
+#############################################
+# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
+#############################################
+
+# set the number of slave nodes
+numSlaves=1
+
+# and the total number of available tasks
+# sets Hadoop parameter "mapreduce.job.reduces"
+numTasks=`expr $numSlaves \* 2`
+
+# number of urls to fetch in one iteration
+# 250K per task?
+sizeFetchlist=`expr $numSlaves \* 50000`
+
+# time limit for fetching (minutes)
+timeLimitFetch=180
+
+# num threads for fetching
+numThreads=50
+
+#############################################
+
+bin="`dirname "$0"`"
+bin="`cd "$bin"; pwd`"
+
+# determine the mode based on the presence of a job file
+mode=local
+if [ -f "${bin}"/../*nutch*.job ]; then
+ mode=distributed
+fi
+
+# note that some of the options listed here could be set in the
+# corresponding hadoop site xml param file
+commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
+
+# check that hadoop can be found on the path
+if [ $mode = "distributed" ]; then
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+ echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+ exit -1;
+ fi
+fi
+
+
+function __bin_nutch {
+ # run $bin/nutch, exit if exit value indicates error
+
+ echo "$bin/nutch $@" ;# echo command and arguments
+ "$bin/nutch" "$@"
+
+ RETCODE=$?
+ if [ $RETCODE -ne 0 ]
+ then
+ echo "Error running:"
+ echo " $bin/nutch $@"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
+}
+
+
+
+# initial injection
+echo "Injecting seed URLs"
+__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
+
+# main loop : rounds of generate - fetch - parse - update
+for ((a=1; ; a++))
+do
+ if [ -e ".STOP" ]
+ then
+ echo "STOP file found - escaping loop"
+ break
+ fi
+
+ if [ $LIMIT -ne -1 ]; then
+ if [ $a -gt $LIMIT ]; then
+ echo `date` ": Finished loop with $LIMIT iterations"
+ break
+ fi
+ echo `date` ": Iteration $a of $LIMIT"
+ else
+ echo `date` ": Iteration $a"
+ fi
+
+ echo "Generating a new segment"
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
+ echo "$bin/nutch generate ${generate_args[@]}"
+ $bin/nutch generate "${generate_args[@]}"
+ RETCODE=$?
+ if [ $RETCODE -eq 0 ]; then
+ : # ok: no error
+ elif [ $RETCODE -eq 1 ]; then
+ echo "Generate returned 1 (no new segments created)"
+
+ if [ "$WAIT" -ne -1 ]; then
+ echo "Waiting for $WAIT sec. ..."
+ sleep $WAIT
+ continue
+ else
+ echo "Escaping loop: no more URLs to fetch now"
+ break
+ fi
+ else
+ echo "Error running:"
+ echo " $bin/nutch generate ${generate_args[@]}"
+ echo "Failed with exit value $RETCODE."
+ exit $RETCODE
+ fi
+
+ # capture the name of the segment
+ # call hadoop in distributed mode
+ # or use ls
+
+ if [ $mode = "local" ]; then
+ SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
+ else
+ SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
+ fi
+
+ echo "Operating on segment : $SEGMENT"
+
+ # fetching the segment
+ echo "Fetching : $SEGMENT"
+ __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
+
+ # parsing the segment
+ echo "Parsing : $SEGMENT"
+ # enable skipping of records during parsing so that a dodgy document
+ # does not fail the whole task
+ skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
+ __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
+
+ # updatedb with this segment
+ echo "CrawlDB update"
+ __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
+
+# note that the link inversion - indexing routine can be done within the main loop
+# on a per segment basis
+ echo "Link inversion"
+ __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+ echo "Dedup on crawldb"
+ __bin_nutch dedup "$CRAWL_PATH"/crawldb
+
+ if $INDEXFLAG; then
+ echo "Indexing $SEGMENT to index"
+ __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+
+ echo "Cleaning up index if possible"
+ __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
+ else
+ echo "Skipping indexing ..."
+ fi
+
+ #######################################################
+ # The following commands fall into WebGraph territory
+ # and should be uncommented based on your requirements
+ #######################################################
+ #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
+ #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
+ #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
+ #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
+ #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
+
+ #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
+ #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
+
+done
+
+exit 0
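For reference, a typical local-mode invocation of the script above could look like the sketch below; the seed directory (urls), crawl directory (crawl) and Solr URL are hypothetical, and the -D property only matters when -i is given:

    # sketch: 3 rounds over seeds in ./urls, indexing each segment,
    # waiting 30 minutes whenever no URLs are due for fetching
    bin/crawl -i -D solr.server.url=http://localhost:8983/solr/nutch -w 30m urls crawl 3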
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/bin/nutch
----------------------------------------------------------------------
diff --git a/bin/nutch b/bin/nutch
new file mode 100755
index 0000000..1649069
--- /dev/null
+++ b/bin/nutch
@@ -0,0 +1,324 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# The Nutch command script
+#
+# Environment Variables
+#
+# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
+#
+# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB.
+# Default is 1000.
+#
+# NUTCH_OPTS Extra Java runtime options.
+# Multiple options must be separated by white space.
+#
+# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs)
+#
+# NUTCH_LOGFILE Log file (default: hadoop.log)
+#
+# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf).
+# Multiple paths must be separated by a colon ':'.
+#
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# resolve links - $0 may be a softlink
+THIS="$0"
+while [ -h "$THIS" ]; do
+ ls=`ls -ld "$THIS"`
+ link=`expr "$ls" : '.*-> \(.*\)$'`
+ if expr "$link" : '.*/.*' > /dev/null; then
+ THIS="$link"
+ else
+ THIS=`dirname "$THIS"`/"$link"
+ fi
+done
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+ echo "nutch 1.12"
+ echo "Usage: nutch COMMAND"
+ echo "where COMMAND is one of:"
+ echo " readdb read / dump crawl db"
+ echo " mergedb merge crawldb-s, with optional filtering"
+ echo " readlinkdb read / dump link db"
+ echo " inject inject new urls into the database"
+ echo " generate generate new segments to fetch from crawl db"
+ echo " freegen generate new segments to fetch from text files"
+ echo " fetch fetch a segment's pages"
+ echo " parse parse a segment's pages"
+ echo " readseg read / dump segment data"
+ echo " mergesegs merge several segments, with optional filtering and slicing"
+ echo " updatedb update crawl db from segments after fetching"
+ echo " invertlinks create a linkdb from parsed segments"
+ echo " mergelinkdb merge linkdb-s, with optional filtering"
+ echo " index run the plugin-based indexer on parsed segments and linkdb"
+ echo " dedup deduplicate entries in the crawldb and give them a special status"
+ echo " dump exports crawled data from segments into files"
+ echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
+ echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
+ echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead"
+ echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
+ echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
+ echo " parsechecker check the parser for a given url"
+ echo " indexchecker check the indexing filters for a given url"
+ echo " filterchecker check url filters for a given url"
+ echo " normalizerchecker check url normalizers for a given url"
+ echo " domainstats calculate domain statistics from crawldb"
+ echo " protocolstats calculate protocol status code stats from crawldb"
+ echo " crawlcomplete calculate crawl completion stats from crawldb"
+ echo " webgraph generate a web graph from existing segments"
+ echo " linkrank run a link analysis program on the generated web graph"
+ echo " scoreupdater updates the crawldb with linkrank scores"
+ echo " nodedumper dumps the web graph's node scores"
+ echo " plugin load a plugin and run one of its classes main()"
+ echo " junit runs the given JUnit test"
+ echo " startserver runs the Nutch Server on localhost:8081"
+ echo " webapp run a local Nutch Web Application on locahost:8080"
+ echo " warc exports crawled data from segments at the WARC format"
+ echo " updatehostdb update the host db with records from the crawl db"
+ echo " readhostdb read / dump host db"
+ echo " or"
+ echo " CLASSNAME run the class named CLASSNAME"
+ echo "Most commands print help when invoked w/o parameters."
+ exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+# some directories
+THIS_DIR="`dirname "$THIS"`"
+NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"
+
+# some Java parameters
+if [ "$NUTCH_JAVA_HOME" != "" ]; then
+ #echo "run java in $NUTCH_JAVA_HOME"
+ JAVA_HOME="$NUTCH_JAVA_HOME"
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+ echo "Error: JAVA_HOME is not set."
+ exit 1
+fi
+
+local=true
+
+# NUTCH_JOB
+if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then
+ local=false
+ for f in "$NUTCH_HOME"/*nutch*.job; do
+ NUTCH_JOB="$f"
+ done
+ # cygwin path translation
+ if $cygwin; then
+ NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"
+ fi
+fi
+
+JAVA="$JAVA_HOME/bin/java"
+JAVA_HEAP_MAX=-Xmx1000m
+
+# check envvars which might override default args
+if [ "$NUTCH_HEAPSIZE" != "" ]; then
+ #echo "run with heapsize $NUTCH_HEAPSIZE"
+ JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
+ #echo $JAVA_HEAP_MAX
+fi
+
+# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
+CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
+CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# add libs to CLASSPATH
+if $local; then
+ for f in "$NUTCH_HOME"/lib/*.jar; do
+ CLASSPATH="${CLASSPATH}:$f";
+ done
+ # local runtime
+ # add plugins to classpath
+ if [ -d "$NUTCH_HOME/plugins" ]; then
+ CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
+ fi
+fi
+
+# cygwin path translation
+if $cygwin; then
+ CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
+fi
+
+# setup 'java.library.path' for native-hadoop code if necessary
+# used only in local mode
+JAVA_LIBRARY_PATH=''
+if [ -d "${NUTCH_HOME}/lib/native" ]; then
+
+ JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`
+
+ if [ -d "${NUTCH_HOME}/lib/native" ]; then
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+ else
+ JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
+ fi
+ fi
+fi
+
+if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
+ JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`"
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+# default log directory & file
+if [ "$NUTCH_LOG_DIR" = "" ]; then
+ NUTCH_LOG_DIR="$NUTCH_HOME/logs"
+fi
+if [ "$NUTCH_LOGFILE" = "" ]; then
+ NUTCH_LOGFILE='hadoop.log'
+fi
+
+#Fix log path under cygwin
+if $cygwin; then
+ NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"
+fi
+
+NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR")
+NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE")
+
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+ NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH")
+fi
+
+# figure out which class to run
+if [ "$COMMAND" = "crawl" ] ; then
+ echo "Command $COMMAND is deprecated, please use bin/crawl instead"
+ exit -1
+elif [ "$COMMAND" = "inject" ] ; then
+ CLASS=org.apache.nutch.crawl.Injector
+elif [ "$COMMAND" = "generate" ] ; then
+ CLASS=org.apache.nutch.crawl.Generator
+elif [ "$COMMAND" = "freegen" ] ; then
+ CLASS=org.apache.nutch.tools.FreeGenerator
+elif [ "$COMMAND" = "fetch" ] ; then
+ CLASS=org.apache.nutch.fetcher.Fetcher
+elif [ "$COMMAND" = "parse" ] ; then
+ CLASS=org.apache.nutch.parse.ParseSegment
+elif [ "$COMMAND" = "readdb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDbReader
+elif [ "$COMMAND" = "mergedb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDbMerger
+elif [ "$COMMAND" = "readlinkdb" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDbReader
+elif [ "$COMMAND" = "readseg" ] ; then
+ CLASS=org.apache.nutch.segment.SegmentReader
+elif [ "$COMMAND" = "mergesegs" ] ; then
+ CLASS=org.apache.nutch.segment.SegmentMerger
+elif [ "$COMMAND" = "updatedb" ] ; then
+ CLASS=org.apache.nutch.crawl.CrawlDb
+elif [ "$COMMAND" = "invertlinks" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDb
+elif [ "$COMMAND" = "mergelinkdb" ] ; then
+ CLASS=org.apache.nutch.crawl.LinkDbMerger
+elif [ "$COMMAND" = "dump" ] ; then
+ CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+ CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
+elif [ "$COMMAND" = "solrindex" ] ; then
+ CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
+ shift
+elif [ "$COMMAND" = "index" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexingJob
+elif [ "$COMMAND" = "solrdedup" ] ; then
+ echo "Command $COMMAND is deprecated, please use dedup instead"
+ exit -1
+elif [ "$COMMAND" = "dedup" ] ; then
+ CLASS=org.apache.nutch.crawl.DeduplicationJob
+elif [ "$COMMAND" = "solrclean" ] ; then
+ CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
+ shift; shift
+elif [ "$COMMAND" = "clean" ] ; then
+ CLASS=org.apache.nutch.indexer.CleaningJob
+elif [ "$COMMAND" = "parsechecker" ] ; then
+ CLASS=org.apache.nutch.parse.ParserChecker
+elif [ "$COMMAND" = "indexchecker" ] ; then
+ CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
+elif [ "$COMMAND" = "filterchecker" ] ; then
+ CLASS=org.apache.nutch.net.URLFilterChecker
+elif [ "$COMMAND" = "normalizerchecker" ] ; then
+ CLASS=org.apache.nutch.net.URLNormalizerChecker
+elif [ "$COMMAND" = "domainstats" ] ; then
+ CLASS=org.apache.nutch.util.domain.DomainStatistics
+elif [ "$COMMAND" = "protocolstats" ] ; then
+ CLASS=org.apache.nutch.util.ProtocolStatusStatistics
+elif [ "$COMMAND" = "crawlcomplete" ] ; then
+ CLASS=org.apache.nutch.util.CrawlCompletionStats
+elif [ "$COMMAND" = "webgraph" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.WebGraph
+elif [ "$COMMAND" = "linkrank" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.LinkRank
+elif [ "$COMMAND" = "scoreupdater" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater
+elif [ "$COMMAND" = "nodedumper" ] ; then
+ CLASS=org.apache.nutch.scoring.webgraph.NodeDumper
+elif [ "$COMMAND" = "plugin" ] ; then
+ CLASS=org.apache.nutch.plugin.PluginRepository
+elif [ "$COMMAND" = "junit" ] ; then
+ CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
+ if $local; then
+ for f in "$NUTCH_HOME"/test/lib/*.jar; do
+ CLASSPATH="${CLASSPATH}:$f";
+ done
+ fi
+ CLASS=org.junit.runner.JUnitCore
+elif [ "$COMMAND" = "startserver" ] ; then
+ CLASS=org.apache.nutch.service.NutchServer
+elif [ "$COMMAND" = "webapp" ] ; then
+ CLASS=org.apache.nutch.webui.NutchUiServer
+elif [ "$COMMAND" = "warc" ] ; then
+ CLASS=org.apache.nutch.tools.warc.WARCExporter
+elif [ "$COMMAND" = "updatehostdb" ] ; then
+ CLASS=org.apache.nutch.hostdb.UpdateHostDb
+elif [ "$COMMAND" = "readhostdb" ] ; then
+ CLASS=org.apache.nutch.hostdb.ReadHostDb
+else
+ CLASS=$COMMAND
+fi
+
+# distributed mode
+EXEC_CALL=(hadoop jar "$NUTCH_JOB")
+
+if $local; then
+ EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH")
+else
+ # check that hadoop can be found on the path
+ if [ $(which hadoop | wc -l ) -eq 0 ]; then
+ echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
+ exit -1;
+ fi
+fi
+
+# run it
+exec "${EXEC_CALL[@]}" $CLASS "$@"
+
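As a quick sanity check of the script above, the sketch below (crawldb path hypothetical) exercises both a built-in command and the CLASSNAME fallback; per the command table, both lines resolve to the same class:

    # sketch: print crawldb statistics via the built-in command ...
    bin/nutch readdb crawl/crawldb -stats
    # ... or equivalently via the CLASSNAME fallback
    bin/nutch org.apache.nutch.crawl.CrawlDbReader crawl/crawldb -stats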
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/nutch-core/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml
index e358f50..ad5c3af 100644
--- a/nutch-core/pom.xml
+++ b/nutch-core/pom.xml
@@ -113,7 +113,8 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<slf4j.version>1.7.12</slf4j.version>
<junit.version>4.12</junit.version>
- <libs.dir>${project.parent.basedir}${file.separator}${libs.subdir}</libs.dir>
+ <dir.root>${project.parent.basedir}</dir.root>
+ <libs.dir>${dir.local}${file.separator}lib</libs.dir>
</properties>
<dependencies>
@@ -468,7 +469,32 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>3.0.1</version>
+ <executions>
+ <execution>
+ <id>copy-resources</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${libs.dir}</outputDirectory>
+ <resources>
+ <resource>
+ <directory>${project.build.directory}</directory>
+ <include>${build.finalName}.jar</include>
+ </resource>
+ <resource>
+ <directory>${project.basedir}</directory>
+ <include>plugin.xml</include>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
</plugins>
</build>
-
</project>
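Given the property wiring above (libs.dir resolving to runtime/local/lib under the project root), packaging should copy the nutch-core jar into the local runtime; a minimal sketch to verify, assuming a build from the project root:

    mvn clean package -DskipTests
    ls runtime/local/lib    # the nutch-core jar should now be present here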
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/nutch-plugins/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml
index e6a6abd..fa7adb7 100644
--- a/nutch-plugins/pom.xml
+++ b/nutch-plugins/pom.xml
@@ -32,6 +32,7 @@
<url>http://nutch.apache.org</url>
<modules>
+ <!--<module>indexer-solr</module>-->
<module>creativecommons</module>
<module>feed</module>
<module>headings</module>
@@ -101,7 +102,9 @@
</modules>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <libs.dir>..${file.separator}..${file.separator}${libs.subdir}</libs.dir> <!-- Note : one additional level is for the child modules-->
+ <!-- Note: the additional level accounts for the child modules nested one level below this POM -->
+ <dir.root>..${file.separator}..${file.separator}</dir.root>
+ <libs.dir>${dir.local.plugins}${file.separator}${project.artifactId}</libs.dir>
</properties>
<dependencies>
<dependency>
@@ -118,5 +121,35 @@
<type>test-jar</type>
</dependency>
</dependencies>
-
+ <build>
+ <finalName>${project.artifactId}</finalName>
+ <plugins>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>3.0.1</version>
+ <executions>
+ <execution>
+ <id>copy-resources</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${libs.dir}</outputDirectory>
+ <resources>
+ <resource>
+ <directory>${project.build.directory}</directory>
+ <include>${build.finalName}.jar</include>
+ </resource>
+ <resource>
+ <directory>${project.basedir}</directory>
+ <include>plugin.xml</include>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
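With libs.dir pointing at ${dir.local.plugins}${file.separator}${project.artifactId}, each plugin module's jar (named after its artifactId via finalName) and its plugin.xml should land in a per-plugin directory under runtime/local/plugins. A sketch for a single module, with the module path assumed:

    mvn package -DskipTests -pl nutch-plugins/feed -am
    ls runtime/local/plugins/feed    # expect feed.jar and plugin.xml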
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 268ab2d..18e22c7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -10,8 +10,10 @@
<packaging>pom</packaging>
<properties>
- <libs.subdir>runtime${file.separator}local${file.separator}lib</libs.subdir>
- <libs.dir>${project.basedir}${file.separator}${libs.subdir}</libs.dir>
+ <dir.root>${project.basedir}</dir.root>
+ <dir.local>${dir.root}${file.separator}runtime${file.separator}local</dir.local>
+ <dir.local.libs>${dir.local}${file.separator}libs</dir.local.libs>
+ <dir.local.plugins>${dir.local}${file.separator}plugins</dir.local.plugins>
<junit.version>4.12</junit.version>
</properties>
<modules>
@@ -37,6 +39,14 @@
<directory>runtime</directory>
<followSymlinks>false</followSymlinks>
</fileset>
+ <fileset>
+ <directory>${dir.local.libs}</directory>
+ <includes>
+ <include>**/*.jar</include>
+ <include>**/*.xml</include>
+ </includes>
+ <followSymlinks>false</followSymlinks>
+ </fileset>
</filesets>
</configuration>
</plugin>
@@ -61,6 +71,43 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <artifactId>maven-resources-plugin</artifactId>
+ <version>3.0.1</version>
+ <executions>
+ <execution>
+ <id>copy-scripts</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${dir.local}${file.separator}bin</outputDirectory>
+ <resources>
+ <resource>
+ <directory>bin</directory>
+ <!-- This plugin doesn't preserve permissions, so the scripts aren't executable-->
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ <execution>
+ <id>copy-conf</id>
+ <phase>package</phase>
+ <goals>
+ <goal>copy-resources</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>${dir.local}${file.separator}conf</outputDirectory>
+ <resources>
+ <resource>
+ <directory>conf</directory>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
</plugins>
</build>
<dependencies>
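Because maven-resources-plugin does not preserve file permissions (as the inline comment above notes), the copied scripts need their execute bit restored before use; a minimal sketch, assuming a build from the project root:

    mvn clean package -DskipTests
    chmod +x runtime/local/bin/crawl runtime/local/bin/nutch
    runtime/local/bin/nutch    # should print the usage message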
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/src/bin/crawl
----------------------------------------------------------------------
diff --git a/src/bin/crawl b/src/bin/crawl
deleted file mode 100755
index 567d35e..0000000
--- a/src/bin/crawl
+++ /dev/null
@@ -1,281 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>
-# -i|--index Indexes crawl results into a configured indexer
-# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
-# are scheduled for fetching. Suffix can be: s for second,
-# m for minute, h for hour and d for day. If no suffix is
-# specified, seconds are used by default.
-# -D A Java property to pass to Nutch calls
-# Seed Dir Directory in which to look for a seeds file
-# Crawl Dir Directory where the crawl/link/segments dirs are saved
-# Num Rounds The number of rounds to run this crawl for
-#
-#
-# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
-# INDEXING FOR EACH SEGMENT
-
-INDEXFLAG=false
-JAVA_PROPERTIES=""
-WAIT=-1 # don't wait if there are no URLs to fetch
-
-function __to_seconds() {
- NUMBER=$(echo $1 | tr -dc '0-9')
- MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')
-
- case $MODIFIER in
- m|M)
- SECONDS=`expr $NUMBER \* 60`
- ;;
- h|H)
- SECONDS=`expr $NUMBER \* 3600`
- ;;
- d|D)
- SECONDS=`expr $NUMBER \* 86400`
- ;;
- s|S|*)
- SECONDS=$NUMBER
- ;;
- esac
-
- echo $SECONDS
-}
-
-while [[ $# -gt 0 ]]
-do
- case $1 in
- -i|--index)
- INDEXFLAG=true
- shift
- ;;
- -D)
- JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
- shift 2
- ;;
- -w|--wait)
- WAIT="${2}"
- shift 2
- ;;
- *)
- break
- ;;
- esac
-done
-
-if [[ $# != 3 ]]; then
- echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] <Seed Dir> <Crawl Dir> <Num Rounds>"
- echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
- echo -e "\t-D\t\tA Java property to pass to Nutch calls"
- echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
- echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
- echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
- echo -e "\t\t\tspecified second is used by default."
- echo -e "\tSeed Dir\tDirectory in which to look for a seeds file"
- echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
- echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
- exit 1
-fi
-
-SEEDDIR="$1"
-CRAWL_PATH="$2"
-LIMIT="$3"
-
-# convert wait time to seconds for compatibility reasons
-if [ "$WAIT" != "-1" ]; then
- WAIT=$( __to_seconds "$WAIT" )
- echo "Time to wait (--wait) = $WAIT sec."
-fi
-
-#############################################
-# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
-#############################################
-
-# set the number of slave nodes
-numSlaves=1
-
-# and the total number of available tasks
-# sets Hadoop parameter "mapreduce.job.reduces"
-numTasks=`expr $numSlaves \* 2`
-
-# number of urls to fetch in one iteration
-# 250K per task?
-sizeFetchlist=`expr $numSlaves \* 50000`
-
-# time limit for fetching (minutes)
-timeLimitFetch=180
-
-# num threads for fetching
-numThreads=50
-
-#############################################
-
-bin="`dirname "$0"`"
-bin="`cd "$bin"; pwd`"
-
-# determine the mode based on the presence of a job file
-mode=local
-if [ -f "${bin}"/../*nutch*.job ]; then
- mode=distributed
-fi
-
-# note that some of the options listed here could be set in the
-# corresponding hadoop site xml param file
-commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
-
-# check that hadoop can be found on the path
-if [ $mode = "distributed" ]; then
- if [ $(which hadoop | wc -l ) -eq 0 ]; then
- echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
- exit -1;
- fi
-fi
-
-
-function __bin_nutch {
- # run $bin/nutch, exit if exit value indicates error
-
- echo "$bin/nutch $@" ;# echo command and arguments
- "$bin/nutch" "$@"
-
- RETCODE=$?
- if [ $RETCODE -ne 0 ]
- then
- echo "Error running:"
- echo " $bin/nutch $@"
- echo "Failed with exit value $RETCODE."
- exit $RETCODE
- fi
-}
-
-
-
-# initial injection
-echo "Injecting seed URLs"
-__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
-
-# main loop : rounds of generate - fetch - parse - update
-for ((a=1; ; a++))
-do
- if [ -e ".STOP" ]
- then
- echo "STOP file found - escaping loop"
- break
- fi
-
- if [ $LIMIT -ne -1 ]; then
- if [ $a -gt $LIMIT ]; then
- echo `date` ": Finished loop with $LIMIT iterations"
- break
- fi
- echo `date` ": Iteration $a of $LIMIT"
- else
- echo `date` ": Iteration $a"
- fi
-
- echo "Generating a new segment"
- generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
- echo "$bin/nutch generate ${generate_args[@]}"
- $bin/nutch generate "${generate_args[@]}"
- RETCODE=$?
- if [ $RETCODE -eq 0 ]; then
- : # ok: no error
- elif [ $RETCODE -eq 1 ]; then
- echo "Generate returned 1 (no new segments created)"
-
- if [ "$WAIT" -ne -1 ]; then
- echo "Waiting for $WAIT sec. ..."
- sleep $WAIT
- continue
- else
- echo "Escaping loop: no more URLs to fetch now"
- break
- fi
- else
- echo "Error running:"
- echo " $bin/nutch generate ${generate_args[@]}"
- echo "Failed with exit value $RETCODE."
- exit $RETCODE
- fi
-
- # capture the name of the segment
- # call hadoop in distributed mode
- # or use ls
-
- if [ $mode = "local" ]; then
- SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
- else
- SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
- fi
-
- echo "Operating on segment : $SEGMENT"
-
- # fetching the segment
- echo "Fetching : $SEGMENT"
- __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
-
- # parsing the segment
- echo "Parsing : $SEGMENT"
- # enable skipping of records during parsing so that a dodgy document
- # does not fail the whole task
- skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
- __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
-
- # updatedb with this segment
- echo "CrawlDB update"
- __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
-
-# note that the link inversion - indexing routine can be done within the main loop
-# on a per segment basis
- echo "Link inversion"
- __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-
- echo "Dedup on crawldb"
- __bin_nutch dedup "$CRAWL_PATH"/crawldb
-
- if $INDEXFLAG; then
- echo "Indexing $SEGMENT to index"
- __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-
- echo "Cleaning up index if possible"
- __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
- else
- echo "Skipping indexing ..."
- fi
-
- #######################################################
- # The following commands fall into WebGraph territory
- # and should be uncommented based on your requirements
- #######################################################
- #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
- #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
-
- #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
- #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
-
- #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
- #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
-
- #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
- #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
-
- #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
- #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
-
-done
-
-exit 0
http://git-wip-us.apache.org/repos/asf/nutch/blob/020f581a/src/bin/nutch
----------------------------------------------------------------------
diff --git a/src/bin/nutch b/src/bin/nutch
deleted file mode 100755
index 1649069..0000000
--- a/src/bin/nutch
+++ /dev/null
@@ -1,324 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# The Nutch command script
-#
-# Environment Variables
-#
-# NUTCH_JAVA_HOME The java implementation to use. Overrides JAVA_HOME.
-#
-# NUTCH_HEAPSIZE The maximum amount of heap to use, in MB.
-# Default is 1000.
-#
-# NUTCH_OPTS Extra Java runtime options.
-# Multiple options must be separated by white space.
-#
-# NUTCH_LOG_DIR Log directory (default: $NUTCH_HOME/logs)
-#
-# NUTCH_LOGFILE Log file (default: hadoop.log)
-#
-# NUTCH_CONF_DIR Path(s) to configuration files (default: $NUTCH_HOME/conf).
-# Multiple paths must be separated by a colon ':'.
-#
-cygwin=false
-case "`uname`" in
-CYGWIN*) cygwin=true;;
-esac
-
-# resolve links - $0 may be a softlink
-THIS="$0"
-while [ -h "$THIS" ]; do
- ls=`ls -ld "$THIS"`
- link=`expr "$ls" : '.*-> \(.*\)$'`
- if expr "$link" : '.*/.*' > /dev/null; then
- THIS="$link"
- else
- THIS=`dirname "$THIS"`/"$link"
- fi
-done
-
-# if no args specified, show usage
-if [ $# = 0 ]; then
- echo "nutch 1.12"
- echo "Usage: nutch COMMAND"
- echo "where COMMAND is one of:"
- echo " readdb read / dump crawl db"
- echo " mergedb merge crawldb-s, with optional filtering"
- echo " readlinkdb read / dump link db"
- echo " inject inject new urls into the database"
- echo " generate generate new segments to fetch from crawl db"
- echo " freegen generate new segments to fetch from text files"
- echo " fetch fetch a segment's pages"
- echo " parse parse a segment's pages"
- echo " readseg read / dump segment data"
- echo " mergesegs merge several segments, with optional filtering and slicing"
- echo " updatedb update crawl db from segments after fetching"
- echo " invertlinks create a linkdb from parsed segments"
- echo " mergelinkdb merge linkdb-s, with optional filtering"
- echo " index run the plugin-based indexer on parsed segments and linkdb"
- echo " dedup deduplicate entries in the crawldb and give them a special status"
- echo " dump exports crawled data from segments into files"
- echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
- echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
- echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead"
- echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
- echo " clean remove HTTP 301 and 404 documents and duplicates from indexing backends configured via plugins"
- echo " parsechecker check the parser for a given url"
- echo " indexchecker check the indexing filters for a given url"
- echo " filterchecker check url filters for a given url"
- echo " normalizerchecker check url normalizers for a given url"
- echo " domainstats calculate domain statistics from crawldb"
- echo " protocolstats calculate protocol status code stats from crawldb"
- echo " crawlcomplete calculate crawl completion stats from crawldb"
- echo " webgraph generate a web graph from existing segments"
- echo " linkrank run a link analysis program on the generated web graph"
- echo " scoreupdater updates the crawldb with linkrank scores"
- echo " nodedumper dumps the web graph's node scores"
- echo " plugin load a plugin and run one of its classes main()"
- echo " junit runs the given JUnit test"
- echo " startserver runs the Nutch Server on localhost:8081"
- echo " webapp run a local Nutch Web Application on locahost:8080"
- echo " warc exports crawled data from segments at the WARC format"
- echo " updatehostdb update the host db with records from the crawl db"
- echo " readhostdb read / dump host db"
- echo " or"
- echo " CLASSNAME run the class named CLASSNAME"
- echo "Most commands print help when invoked w/o parameters."
- exit 1
-fi
-
-# get arguments
-COMMAND=$1
-shift
-
-# some directories
-THIS_DIR="`dirname "$THIS"`"
-NUTCH_HOME="`cd "$THIS_DIR/.." ; pwd`"
-
-# some Java parameters
-if [ "$NUTCH_JAVA_HOME" != "" ]; then
- #echo "run java in $NUTCH_JAVA_HOME"
- JAVA_HOME="$NUTCH_JAVA_HOME"
-fi
-
-if [ "$JAVA_HOME" = "" ]; then
- echo "Error: JAVA_HOME is not set."
- exit 1
-fi
-
-local=true
-
-# NUTCH_JOB
-if [ -f "${NUTCH_HOME}"/*nutch*.job ]; then
- local=false
- for f in "$NUTCH_HOME"/*nutch*.job; do
- NUTCH_JOB="$f"
- done
- # cygwin path translation
- if $cygwin; then
- NUTCH_JOB="`cygpath -p -w "$NUTCH_JOB"`"
- fi
-fi
-
-JAVA="$JAVA_HOME/bin/java"
-JAVA_HEAP_MAX=-Xmx1000m
-
-# check envvars which might override default args
-if [ "$NUTCH_HEAPSIZE" != "" ]; then
- #echo "run with heapsize $NUTCH_HEAPSIZE"
- JAVA_HEAP_MAX="-Xmx""$NUTCH_HEAPSIZE""m"
- #echo $JAVA_HEAP_MAX
-fi
-
-# CLASSPATH initially contains $NUTCH_CONF_DIR, or defaults to $NUTCH_HOME/conf
-CLASSPATH="${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}"
-CLASSPATH="${CLASSPATH}:$JAVA_HOME/lib/tools.jar"
-
-# so that filenames w/ spaces are handled correctly in loops below
-IFS=
-
-# add libs to CLASSPATH
-if $local; then
- for f in "$NUTCH_HOME"/lib/*.jar; do
- CLASSPATH="${CLASSPATH}:$f";
- done
- # local runtime
- # add plugins to classpath
- if [ -d "$NUTCH_HOME/plugins" ]; then
- CLASSPATH="${NUTCH_HOME}:${CLASSPATH}"
- fi
-fi
-
-# cygwin path translation
-if $cygwin; then
- CLASSPATH="`cygpath -p -w "$CLASSPATH"`"
-fi
-
-# setup 'java.library.path' for native-hadoop code if necessary
-# used only in local mode
-JAVA_LIBRARY_PATH=''
-if [ -d "${NUTCH_HOME}/lib/native" ]; then
-
- JAVA_PLATFORM=`"${JAVA}" -classpath "$CLASSPATH" org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g'`
-
- if [ -d "${NUTCH_HOME}/lib/native" ]; then
- if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
- JAVA_LIBRARY_PATH="${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
- else
- JAVA_LIBRARY_PATH="${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}"
- fi
- fi
-fi
-
-if [ $cygwin = true -a "X${JAVA_LIBRARY_PATH}" != "X" ]; then
- JAVA_LIBRARY_PATH="`cygpath -p -w "$JAVA_LIBRARY_PATH"`"
-fi
-
-# restore ordinary behaviour
-unset IFS
-
-# default log directory & file
-if [ "$NUTCH_LOG_DIR" = "" ]; then
- NUTCH_LOG_DIR="$NUTCH_HOME/logs"
-fi
-if [ "$NUTCH_LOGFILE" = "" ]; then
- NUTCH_LOGFILE='hadoop.log'
-fi
-
-#Fix log path under cygwin
-if $cygwin; then
- NUTCH_LOG_DIR="`cygpath -p -w "$NUTCH_LOG_DIR"`"
-fi
-
-NUTCH_OPTS=($NUTCH_OPTS -Dhadoop.log.dir="$NUTCH_LOG_DIR")
-NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Dhadoop.log.file="$NUTCH_LOGFILE")
-
-if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
- NUTCH_OPTS=("${NUTCH_OPTS[@]}" -Djava.library.path="$JAVA_LIBRARY_PATH")
-fi
-
-# figure out which class to run
-if [ "$COMMAND" = "crawl" ] ; then
- echo "Command $COMMAND is deprecated, please use bin/crawl instead"
- exit -1
-elif [ "$COMMAND" = "inject" ] ; then
- CLASS=org.apache.nutch.crawl.Injector
-elif [ "$COMMAND" = "generate" ] ; then
- CLASS=org.apache.nutch.crawl.Generator
-elif [ "$COMMAND" = "freegen" ] ; then
- CLASS=org.apache.nutch.tools.FreeGenerator
-elif [ "$COMMAND" = "fetch" ] ; then
- CLASS=org.apache.nutch.fetcher.Fetcher
-elif [ "$COMMAND" = "parse" ] ; then
- CLASS=org.apache.nutch.parse.ParseSegment
-elif [ "$COMMAND" = "readdb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDbReader
-elif [ "$COMMAND" = "mergedb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDbMerger
-elif [ "$COMMAND" = "readlinkdb" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDbReader
-elif [ "$COMMAND" = "readseg" ] ; then
- CLASS=org.apache.nutch.segment.SegmentReader
-elif [ "$COMMAND" = "mergesegs" ] ; then
- CLASS=org.apache.nutch.segment.SegmentMerger
-elif [ "$COMMAND" = "updatedb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDb
-elif [ "$COMMAND" = "invertlinks" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDb
-elif [ "$COMMAND" = "mergelinkdb" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDbMerger
-elif [ "$COMMAND" = "dump" ] ; then
- CLASS=org.apache.nutch.tools.FileDumper
-elif [ "$COMMAND" = "commoncrawldump" ] ; then
- CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
-elif [ "$COMMAND" = "solrindex" ] ; then
- CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
- shift
-elif [ "$COMMAND" = "index" ] ; then
- CLASS=org.apache.nutch.indexer.IndexingJob
-elif [ "$COMMAND" = "solrdedup" ] ; then
- echo "Command $COMMAND is deprecated, please use dedup instead"
- exit -1
-elif [ "$COMMAND" = "dedup" ] ; then
- CLASS=org.apache.nutch.crawl.DeduplicationJob
-elif [ "$COMMAND" = "solrclean" ] ; then
- CLASS="org.apache.nutch.indexer.CleaningJob -D solr.server.url=$2 $1"
- shift; shift
-elif [ "$COMMAND" = "clean" ] ; then
- CLASS=org.apache.nutch.indexer.CleaningJob
-elif [ "$COMMAND" = "parsechecker" ] ; then
- CLASS=org.apache.nutch.parse.ParserChecker
-elif [ "$COMMAND" = "indexchecker" ] ; then
- CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
-elif [ "$COMMAND" = "filterchecker" ] ; then
- CLASS=org.apache.nutch.net.URLFilterChecker
-elif [ "$COMMAND" = "normalizerchecker" ] ; then
- CLASS=org.apache.nutch.net.URLNormalizerChecker
-elif [ "$COMMAND" = "domainstats" ] ; then
- CLASS=org.apache.nutch.util.domain.DomainStatistics
-elif [ "$COMMAND" = "protocolstats" ] ; then
- CLASS=org.apache.nutch.util.ProtocolStatusStatistics
-elif [ "$COMMAND" = "crawlcomplete" ] ; then
- CLASS=org.apache.nutch.util.CrawlCompletionStats
-elif [ "$COMMAND" = "webgraph" ] ; then
- CLASS=org.apache.nutch.scoring.webgraph.WebGraph
-elif [ "$COMMAND" = "linkrank" ] ; then
- CLASS=org.apache.nutch.scoring.webgraph.LinkRank
-elif [ "$COMMAND" = "scoreupdater" ] ; then
- CLASS=org.apache.nutch.scoring.webgraph.ScoreUpdater
-elif [ "$COMMAND" = "nodedumper" ] ; then
- CLASS=org.apache.nutch.scoring.webgraph.NodeDumper
-elif [ "$COMMAND" = "plugin" ] ; then
- CLASS=org.apache.nutch.plugin.PluginRepository
-elif [ "$COMMAND" = "junit" ] ; then
- CLASSPATH="$CLASSPATH:$NUTCH_HOME/test/classes/"
- if $local; then
- for f in "$NUTCH_HOME"/test/lib/*.jar; do
- CLASSPATH="${CLASSPATH}:$f";
- done
- fi
- CLASS=org.junit.runner.JUnitCore
-elif [ "$COMMAND" = "startserver" ] ; then
- CLASS=org.apache.nutch.service.NutchServer
-elif [ "$COMMAND" = "webapp" ] ; then
- CLASS=org.apache.nutch.webui.NutchUiServer
-elif [ "$COMMAND" = "warc" ] ; then
- CLASS=org.apache.nutch.tools.warc.WARCExporter
-elif [ "$COMMAND" = "updatehostdb" ] ; then
- CLASS=org.apache.nutch.hostdb.UpdateHostDb
-elif [ "$COMMAND" = "readhostdb" ] ; then
- CLASS=org.apache.nutch.hostdb.ReadHostDb
-else
- CLASS=$COMMAND
-fi
-
-# distributed mode
-EXEC_CALL=(hadoop jar "$NUTCH_JOB")
-
-if $local; then
- EXEC_CALL=("$JAVA" $JAVA_HEAP_MAX "${NUTCH_OPTS[@]}" -classpath "$CLASSPATH")
-else
- # check that hadoop can be found on the path
- if [ $(which hadoop | wc -l ) -eq 0 ]; then
- echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
- exit -1;
- fi
-fi
-
-# run it
-exec "${EXEC_CALL[@]}" $CLASS "$@"
-
[2/2] nutch git commit: Convert tests which require the plugin.folders system prop to integration tests
Posted by th...@apache.org.
Convert tests which require the plugin.folders system prop to integration tests
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/9f3ba3ed
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/9f3ba3ed
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/9f3ba3ed
Branch: refs/heads/NUTCH-2292
Commit: 9f3ba3eda59219eabe7020f2c65b505dbc46d947
Parents: 020f581
Author: Thamme Gowda <th...@apache.org>
Authored: Sun Jul 10 19:07:57 2016 -0700
Committer: Thamme Gowda <th...@apache.org>
Committed: Sun Jul 10 19:07:57 2016 -0700
----------------------------------------------------------------------
nutch-core/pom.xml | 22 ++++++++++++
.../nutch/crawl/TODOTestCrawlDbStates.java | 3 ++
.../apache/nutch/crawl/TestCrawlDbFilter.java | 3 ++
.../apache/nutch/crawl/TestCrawlDbMerger.java | 3 ++
.../apache/nutch/crawl/TestCrawlDbStates.java | 3 ++
.../org/apache/nutch/crawl/TestGenerator.java | 3 ++
.../org/apache/nutch/crawl/TestInjector.java | 3 ++
.../org/apache/nutch/fetcher/TestFetcher.java | 3 ++
.../nutch/indexer/TestIndexerMapReduce.java | 3 ++
.../nutch/indexer/TestIndexingFilters.java | 3 ++
.../org/apache/nutch/net/TestURLFilters.java | 3 ++
.../apache/nutch/net/TestURLNormalizers.java | 3 ++
.../apache/nutch/parse/TestParserFactory.java | 3 ++
.../apache/nutch/plugin/TestPluginSystem.java | 3 ++
.../nutch/protocol/TestProtocolFactory.java | 3 ++
.../nutch/tools/TestCommonCrawlDataDumper.java | 5 +--
.../org/apache/nutch/util/TestMimeUtil.java | 12 +++++--
nutch-plugins/pom.xml | 9 +++++
pom.xml | 36 ++++++++++++++++++++
19 files changed, 122 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-core/pom.xml b/nutch-core/pom.xml
index ad5c3af..62e2e58 100644
--- a/nutch-core/pom.xml
+++ b/nutch-core/pom.xml
@@ -456,6 +456,11 @@
</dependencies>
<build>
+ <resources>
+ <resource>
+ <directory>${project.parent.basedir}${file.separator}conf</directory>
+ </resource>
+ </resources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -495,6 +500,23 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <excludedGroups>org.apache.nutch.test.IntegrationTest</excludedGroups>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <systemPropertyVariables>
+ <plugin.folders>../runtime/local/plugins</plugin.folders>
+ </systemPropertyVariables>
+ </configuration>
+ </plugin>
</plugins>
</build>
</project>
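With surefire excluding the IntegrationTest category and failsafe supplying plugin.folders, the intended split would be exercised as sketched below, assuming failsafe is also configured to pick up the categorized tests (the root pom changes are not shown in full here):

    mvn test      # surefire: unit tests only, IntegrationTest category excluded
    mvn verify    # failsafe: integration tests run with plugin.folders=../runtime/local/plugins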
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
index e44cb39..fd88c7d 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
@@ -3,14 +3,17 @@ package org.apache.nutch.crawl;
import static org.apache.nutch.crawl.CrawlDatum.*;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.TimingUtil;
import static org.junit.Assert.*;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+@Category({ IntegrationTest.class})
public class TODOTestCrawlDbStates extends TestCrawlDbStates {
private static final Logger LOG = LoggerFactory
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
index 38c38ed..773dd29 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -28,11 +28,13 @@ import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* CrawlDbFiltering test which tests for correct, error free url normalization
@@ -73,6 +75,7 @@ public class TestCrawlDbFilter {
* @throws Exception
*/
@Test
+ @Category({IntegrationTest.class})
public void testUrl404Purging() throws Exception {
// create a CrawlDatum with DB GONE status
ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
index b670551..599c353 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@ -30,11 +30,13 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
public class TestCrawlDbMerger {
private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class
@@ -110,6 +112,7 @@ public class TestCrawlDbMerger {
* @throws Exception
*/
@Test
+ @Category({IntegrationTest.class})
public void testMerge() throws Exception {
Path crawldb1 = new Path(testDir, "crawldb1");
Path crawldb2 = new Path(testDir, "crawldb2");
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
index c54559b..b631319 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -35,8 +35,10 @@ import org.apache.nutch.scoring.ScoringFilters;
import static org.junit.Assert.*;
+import org.apache.nutch.test.IntegrationTest;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -57,6 +59,7 @@ import org.slf4j.LoggerFactory;
* </ul>
* </li> </ul>
*/
+@Category({IntegrationTest.class})
public class TestCrawlDbStates {
private static final Logger LOG = LoggerFactory
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
index 84e6b28..0ce3c5f 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestGenerator.java
@@ -28,10 +28,12 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
+import org.apache.nutch.test.IntegrationTest;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* Basic generator test. 1. Insert entries in crawldb 2. Generates entries to
@@ -39,6 +41,7 @@ import org.junit.Test;
* highest scoring urls are generated
*
*/
+@Category({IntegrationTest.class})
public class TestGenerator {
Configuration conf;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
index 7293cbb..59a3e8c 100644
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestInjector.java
@@ -29,10 +29,12 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.Reader.Option;
+import org.apache.nutch.test.IntegrationTest;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* Basic injector test: 1. Creates a text file with urls 2. Injects them into
@@ -40,6 +42,7 @@ import org.junit.Test;
* into webdb 5. Reads crawldb entries and verifies contents
*
*/
+@Category({IntegrationTest.class})
public class TestInjector {
private Configuration conf;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
index fae5f90..a23d080 100644
--- a/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
+++ b/nutch-core/src/test/java/org/apache/nutch/fetcher/TestFetcher.java
@@ -32,10 +32,12 @@ import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
import org.mortbay.jetty.Server;
/**
@@ -79,6 +81,7 @@ public class TestFetcher {
}
@Test
+ @Category(IntegrationTest.class)
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
// generate seedlist
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
index d581a0f..3a25f26 100644
--- a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexerMapReduce.java
@@ -30,8 +30,10 @@ import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -99,6 +101,7 @@ public class TestIndexerMapReduce {
* Test indexing of base64-encoded binary content.
*/
@Test
+ @Category(IntegrationTest.class)
public void testBinaryContentBase64() {
configuration = NutchConfiguration.create();
configuration.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, true);
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
index 4d5849f..14b246b 100644
--- a/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
+++ b/nutch-core/src/test/java/org/apache/nutch/indexer/TestIndexingFilters.java
@@ -25,10 +25,13 @@ import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
+@Category(IntegrationTest.class)
public class TestIndexingFilters {
/**
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
index c43941a..ef07907 100644
--- a/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
+++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLFilters.java
@@ -17,9 +17,12 @@
package org.apache.nutch.net;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
+@Category(IntegrationTest.class)
public class TestURLFilters {
/**
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
index 6fdbb9d..d29e9d3 100644
--- a/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
+++ b/nutch-core/src/test/java/org/apache/nutch/net/TestURLNormalizers.java
@@ -19,10 +19,13 @@ package org.apache.nutch.net;
import java.net.MalformedURLException;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
+@Category(IntegrationTest.class)
public class TestURLNormalizers {
@Test
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
index 00c524e..198e284 100644
--- a/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
+++ b/nutch-core/src/test/java/org/apache/nutch/parse/TestParserFactory.java
@@ -20,10 +20,12 @@ package org.apache.nutch.parse;
// Nutch imports
import org.apache.nutch.plugin.Extension;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* Unit test for new parse plugin selection.
@@ -31,6 +33,7 @@ import org.junit.Test;
* @author Sebastien Le Callonnec
* @version 1.0
*/
+@Category(IntegrationTest.class)
public class TestParserFactory {
private Configuration conf;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
index a5f4e32..7bcc9ab 100644
--- a/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
+++ b/nutch-core/src/test/java/org/apache/nutch/plugin/TestPluginSystem.java
@@ -28,16 +28,19 @@ import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
/**
* Unit tests for the plugin system
*/
+@Category(IntegrationTest.class)
public class TestPluginSystem {
private int fPluginCount;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
index 394c303..6b4c8fd 100644
--- a/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
+++ b/nutch-core/src/test/java/org/apache/nutch/protocol/TestProtocolFactory.java
@@ -17,12 +17,15 @@
package org.apache.nutch.protocol;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.test.IntegrationTest;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.ObjectCache;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
+@Category(IntegrationTest.class)
public class TestProtocolFactory {
Configuration conf;
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
index 1429925..fef0e69 100644
--- a/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
+++ b/nutch-core/src/test/java/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
@@ -19,6 +19,8 @@ package org.apache.nutch.tools;
//Junit imports
import static org.junit.Assert.*;
+
+import org.apache.nutch.test.TestUtils;
import org.junit.Test;
//Commons imports
@@ -43,8 +45,7 @@ public class TestCommonCrawlDataDumper {
@Test
public void testDump() throws Exception {
- File sampleSegmentDir = new File(System.getProperty("test.build.data",
- "."), "test-segments");
+ File sampleSegmentDir = TestUtils.getFile(this, "test-segments");
File tempDir = Files.createTempDirectory("temp").toFile();
String[] crawledFiles = {
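TestUtils.getFile is also new in this change, and its source is not in this half of the diff. Judging from the call sites (a test instance plus a resource name, throwing FileNotFoundException), it presumably resolves test data through the classpath rather than the ant-era test.build.data system property, which has no counterpart in the Maven build. A rough sketch of such a helper, with the actual implementation unconfirmed:

    package org.apache.nutch.test;

    import java.io.File;
    import java.io.FileNotFoundException;
    import java.net.URL;

    public class TestUtils {

      /** Resolves a named test resource via the caller's classloader. */
      public static File getFile(Object caller, String name)
          throws FileNotFoundException {
        URL url = caller.getClass().getClassLoader().getResource(name);
        if (url == null) {
          throw new FileNotFoundException(name + " is not on the classpath of "
              + caller.getClass().getName());
        }
        return new File(url.getFile());
      }
    }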
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java b/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
index d0b45db..d812110 100644
--- a/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
+++ b/nutch-core/src/test/java/org/apache/nutch/util/TestMimeUtil.java
@@ -18,6 +18,7 @@
package org.apache.nutch.util;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.Charset;
@@ -26,6 +27,7 @@ import org.apache.hadoop.conf.Configuration;
import com.google.common.io.Files;
import junit.framework.TestCase;
+import org.apache.nutch.test.TestUtils;
public class TestMimeUtil extends TestCase {
@@ -33,8 +35,14 @@ public class TestMimeUtil extends TestCase {
private static Charset defaultCharset = Charset.forName("UTF-8");
- private File sampleDir = new File(System.getProperty("test.build.data", "."),
- "test-mime-util");
+ private File sampleDir;
+ {
+ try {
+ sampleDir = TestUtils.getFile(this, "test-mime-util");
+ } catch (FileNotFoundException e){
+ throw new RuntimeException(e);
+ }
+ }
/**
* test data, every element on "test page":
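The instance initializer block above exists because a plain field initializer cannot throw the checked FileNotFoundException that TestUtils.getFile declares. Since TestMimeUtil is a JUnit 3 style TestCase, a hypothetical alternative would be to defer the lookup to setUp(), so a missing resource surfaces as an ordinary test error instead of a RuntimeException at construction time:

    private File sampleDir;

    @Override
    protected void setUp() throws Exception {
      super.setUp();
      // Resolve the sample data once per test; setUp() may throw checked
      // exceptions, so no wrapping is needed here.
      sampleDir = TestUtils.getFile(this, "test-mime-util");
    }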
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/nutch-plugins/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml
index fa7adb7..e07f487 100644
--- a/nutch-plugins/pom.xml
+++ b/nutch-plugins/pom.xml
@@ -150,6 +150,15 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.19.1</version>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.19.1</version>
+ </plugin>
</plugins>
</build>
</project>
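nutch-plugins declares the two plugins above with a version but no configuration of its own; the shared settings come from the <pluginManagement> section added to the parent pom.xml just below. With that wiring, mvn test runs only Surefire, which skips everything tagged IntegrationTest, while mvn verify additionally triggers the Failsafe execution bound to the integration-test and verify phases, which runs only the tagged group. The <include>**/*.java</include> override in the parent is what makes this work: by default Failsafe only picks up classes matching the IT*.java / *IT.java / *ITCase.java naming patterns, so widening the includes lets it scan every test class and filter purely by category.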
http://git-wip-us.apache.org/repos/asf/nutch/blob/9f3ba3ed/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 18e22c7..a3b9271 100644
--- a/pom.xml
+++ b/pom.xml
@@ -109,6 +109,42 @@
</executions>
</plugin>
</plugins>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+        <!-- Surefire is for unit tests; here we exclude the integration test group -->
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <excludedGroups>org.apache.nutch.test.IntegrationTest</excludedGroups>
+ </configuration>
+ </plugin>
+ <plugin>
+        <!-- Failsafe is for integration tests; here we run only the integration test group -->
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.19.1</version>
+ <configuration>
+ <systemPropertyVariables>
+ <plugin.folders>${dir.local.plugins}</plugin.folders>
+ </systemPropertyVariables>
+ <includes>
+ <include>**/*.java</include>
+ </includes>
+ <groups>org.apache.nutch.test.IntegrationTest</groups>
+ </configuration>
+ <executions>
+ <execution>
+ <id>integration-test</id>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </pluginManagement>
</build>
<dependencies>
<dependency>