Posted to commits@nutch.apache.org by Apache Wiki <wi...@apache.org> on 2006/07/21 21:09:22 UTC
[Nutch Wiki] Update of "IntranetRecrawl" by MatthewHolt
Dear Wiki user,
You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.
The following page has been changed by MatthewHolt:
http://wiki.apache.org/nutch/IntranetRecrawl
New page:
[[TableOfContents]]
Here are a couple of scripts for recrawling your intranet with Nutch.
= Version 0.7.2 =
Place this script in the main Nutch directory and run it.
== Example Usage ==
./recrawl crawl 10 31
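The arguments are the crawl directory, the depth, and the adddays value described in the script below. If the script was just created it may also need to be marked executable first; a minimal sketch, assuming it was saved as "recrawl" in the Nutch root:

chmod +x recrawl
./recrawl crawl 10 31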
== Script ==
#!/bin/bash
# A simple script to run a Nutch re-crawl
if [ -n "$1" ]
then
crawl_dir=$1
else
echo "Usage: recrawl crawl_dir [depth] [adddays]"
exit 1
fi
if [ -n "$2" ]
then
depth=$2
else
depth=5
fi
if [ -n "$3" ]
then
adddays=$3
else
adddays=0
fi
webdb_dir=$crawl_dir/db
segments_dir=$crawl_dir/segments
index_dir=$crawl_dir/index
# The generate/fetch/update cycle
for ((i=1; i <= depth ; i++))
do
  bin/nutch generate $webdb_dir $segments_dir -adddays $adddays
  segment=`ls -d $segments_dir/* | tail -1`
  bin/nutch fetch $segment
  bin/nutch updatedb $webdb_dir $segment
done
# Update segments
mkdir tmp
bin/nutch updatesegs $webdb_dir $segments_dir tmp
rm -R tmp
# Index segments
for segment in `ls -d $segments_dir/* | tail -$depth`
do
  bin/nutch index $segment
done
# De-duplicate indexes
# "bogus" argument is ignored but needed due to
# a bug in the number of args expected
bin/nutch dedup $segments_dir bogus
# Merge indexes
ls -d $segments_dir/* | xargs bin/nutch merge $index_dir
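Recrawls are usually run on a schedule rather than by hand. A minimal cron sketch, assuming Nutch is installed in /usr/local/nutch and the script above was saved there as "recrawl" (the paths and log file are assumptions; adjust to your install):

# Run the recrawl every night at 2 AM; cd first because the script calls bin/nutch relative to the Nutch root
0 2 * * * cd /usr/local/nutch && ./recrawl crawl 10 31 >> /var/log/nutch-recrawl.log 2>&1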
= Version 0.8.0 =
Place this script in the bin sub-directory of the Nutch install and run it.
== Example Usage ==
/usr/local/nutch/bin/recrawl /usr/local/tomcat/webapps/ROOT /usr/local/nutch/crawl 10 30
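Note that bin/nutch needs to find a JVM; if NUTCH_JAVA_HOME (or JAVA_HOME) is not already exported for the user running the recrawl, export it first. A sketch, with an assumed JDK location:

export JAVA_HOME=/usr/lib/jvm/java
/usr/local/nutch/bin/recrawl /usr/local/tomcat/webapps/ROOT /usr/local/nutch/crawl 10 30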
== Script ==
#!/bin/bash
# Nutch recrawl script.
# Based on 0.7.2 script at http://today.java.net/pub/a/today/2006/02/16/introduction-to-nutch-2.html
# Modified by Matthew Holt
if [ -n "$1" ]
then
tomcat_dir=$1
else
echo "Usage: recrawl servlet_path crawl_dir [depth] [adddays]"
echo "servlet_path - Path of the nutch servlet (i.e. /usr/local/tomcat/webapps/ROOT)"
echo "crawl_dir - Path of the directory the crawl is located in."
echo "[depth] - The link depth from the root page that should be crawled."
echo "[adddays] - Advance the clock # of days for fetchlist generation."
exit 1
fi
if [ -n "$2" ]
then
crawl_dir=$2
else
echo "Usage: recrawl servlet_path crawl_dir [depth] [adddays]"
echo "servlet_path - Path of the nutch servlet (i.e. /usr/local/tomcat/webapps/ROOT)"
echo "crawl_dir - Path of the directory the crawl is located in."
echo "[depth] - The link depth from the root page that should be crawled."
echo "[adddays] - Advance the clock # of days for fetchlist generation."
exit 1
fi
if [ -n "$3" ]
then
depth=$3
else
depth=5
fi
if [ -n "$4" ]
then
adddays=$4
else
adddays=0
fi
# Set the path to the bin directory (the directory this script lives in)
nutch_dir=`dirname $0`
# Only change if your crawl subdirectories are named something different
webdb_dir=$crawl_dir/crawldb
segments_dir=$crawl_dir/segments
linkdb_dir=$crawl_dir/linkdb
index_dir=$crawl_dir/index
# The generate/fetch/update cycle
for ((i=1; i <= depth ; i++))
do
  $nutch_dir/nutch generate $webdb_dir $segments_dir -adddays $adddays
  segment=`ls -d $segments_dir/* | tail -1`
  $nutch_dir/nutch fetch $segment
  $nutch_dir/nutch updatedb $webdb_dir $segment
done
# Invert links (build the link database from the segments)
$nutch_dir/nutch invertlinks $linkdb_dir -dir $segments_dir
# Index segments
new_indexes=$crawl_dir/newindexes
#ls -d $segments_dir/* | tail -$depth | xargs
$nutch_dir/nutch index $new_indexes $webdb_dir $linkdb_dir $segments_dir/*
# De-duplicate indexes
$nutch_dir/nutch dedup $new_indexes
# Merge indexes
$nutch_dir/nutch merge $index_dir $new_indexes
# Tell Tomcat to reload index
touch $tomcat_dir/WEB-INF/web.xml
# Clean up
rm -rf $new_indexes
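Once the script finishes, it can be worth confirming that the merged index is actually searchable before relying on the reloaded webapp. One way is the command-line searcher bundled with 0.8, assuming the searcher.dir property (default "crawl") resolves to your crawl directory and "apache" is just an example query term:

cd /usr/local/nutch
bin/nutch org.apache.nutch.searcher.NutchBean apache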