You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by nalgonda <de...@gmail.com> on 2008/08/25 09:19:13 UTC
schedule recrawling in nutch
Hi ,
I found 3 scripts for scheduled recrawling,
but my question is: should they be run directly at the command prompt,
or should I create a file from them?
If I create a file, where should I put it?
If anyone has an idea, please share it.
#!/bin/sh
# Top-level recrawl driver (intended for cron): run the crawl script,
# then merge the new crawl into the live search index.
# Output is discarded; diagnostics still reach stderr.
#
# Fix: do NOT attempt the merge if the crawl itself failed — merging a
# partial/failed crawl would corrupt the live index.
sh /apps/Linux64/nutch/bin/nutch.sh > /dev/null || exit 1
sh /apps/Linux64/nutch/bin/nutch-merge.sh > /dev/null
It calls a script to crawl and then another to merge the indexes...
This does the crawl
#!/bin/sh
#
#ce 11/06/08
# Update the Nutch databases: inject seed URLs, run $DEPTH rounds of
# generate/fetch/updatedb, merge all segments into one, invert links,
# build a fresh index, de-duplicate it, and merge it into ${CRAWL}/index.
#
# NOTE(review): the multi-line commands below had been wrapped by the
# mailing-list archive (continuation lines with no trailing '\'), which
# made the script non-functional as pasted; they are re-joined here.
NUTCH_HOME=/apps/Linux64/nutch/nutch-1.0-dev
NUTCH=${NUTCH_HOME}/bin/nutch
CRAWL=/user_data/ARCHIVE/nutch2
DEPTH=10
export JAVA_HOME=/apps/Linux64/nutch/jdk

#----- Inject
"$NUTCH" inject "${CRAWL}/crawldb" "${NUTCH_HOME}/urls/"

#----- Generate, Fetch, Parse, Update
# POSIX while-loop: the original for(( ... )) is a bashism under #!/bin/sh.
i=0
while [ "$i" -lt "$DEPTH" ]; do
  # printf instead of echo: 'echo "...\n"' is not portable across shells.
  printf 'Generating pass %s\n' "$i"
  #"$NUTCH" generate "${CRAWL}/crawldb" "${CRAWL}/segments" -adddays 31
  "$NUTCH" generate "${CRAWL}/crawldb" "${CRAWL}/segments" -topN 3000 \
    -adddays 61
  # Newest segment is the one generate just created (lexicographic
  # timestamp names), hence 'tail -1'.
  SEG=$(ls -d "${CRAWL}/segments/"* | tail -1)
  "$NUTCH" fetch2 "$SEG"
  #"$NUTCH" fetch "$SEG"
  "$NUTCH" updatedb "${CRAWL}/crawldb" -dir "${CRAWL}/segments" -filter \
    -normalize
  i=$((i + 1))
done
#end_loop

#----- Merge Segments
"$NUTCH" mergesegs "${CRAWL}/merged_segments" "${CRAWL}/segments/"* -filter
rm -rf "${CRAWL}/segments/"*
mv "${CRAWL}/merged_segments/"* "${CRAWL}/segments"
rmdir "${CRAWL}/merged_segments"
#--------Merge links
#
#----- Invert Links
"$NUTCH" invertlinks "${CRAWL}/linkdb" "${CRAWL}/segments/"*
#----- Index
# The pasted original listed this command twice (one copy half-commented,
# leaving its continuation line '${CRAWL}/segments/*' to execute as a
# bare command — a real bug); a single correct invocation remains.
"$NUTCH" index "${CRAWL}/NEWindexes" "${CRAWL}/crawldb" "${CRAWL}/linkdb" \
  "${CRAWL}/segments/"*
#----- Dedup
"$NUTCH" dedup "${CRAWL}/NEWindexes"
if [ -d "${CRAWL}/index" ]; then
  rm -rf "${CRAWL}/index"
fi
#----- Merge indexes
"$NUTCH" merge "${CRAWL}/index" "${CRAWL}/NEWindexes"
This merges the two crawls....
yap /apps/Linux64/nutch/bin # cat /apps/Linux64/nutch/bin/nutch-merge.sh
#!/bin/sh
#
#ce 11/06/08
# Merge the fresh crawl (CRAWL3) into the live crawl (CRAWL2): merge the
# crawldbs, linkdbs and segments into a temporary crawl dir, rebuild the
# linkdb/index over the merged data, then swap the result into place
# while Tomcat is stopped. The previous live crawl is kept as OLD_CRAWL.
#
# NOTE(review): the multi-line commands below had been wrapped by the
# mailing-list archive (continuation lines with no trailing '\'), which
# made the script non-functional as pasted; they are re-joined here.
NUTCH_HOME=/apps/Linux64/nutch/nutch-1.0-dev
NUTCH=${NUTCH_HOME}/bin/nutch
#tomcat stop/start
TOMCAT=/etc/init.d/tomcatsearch
#final dest of the crawl - populated and accessed by tomcat
CRAWL=/user_data/ARCHIVE/nutch
#keep the old crawl in case it goes pear-shaped
OLD_CRAWL=/user_data/ARCHIVE/nutch-old
#Temp merged CRAWL is the destination - must be empty ($$ = our PID,
#so each run gets a fresh directory name)
MERGED_CRAWL=/user_data/ARCHIVE/nutch.$$
#our input crawls
CRAWL2=/user_data/ARCHIVE/nutch
CRAWL3=/user_data/ARCHIVE/nutch2
export JAVA_HOME=/apps/Linux64/nutch/jdk

"$NUTCH" mergedb "${MERGED_CRAWL}/crawldb" "${CRAWL2}/crawldb" \
  "${CRAWL3}/crawldb" -filter -normalize
"$NUTCH" mergelinkdb "${MERGED_CRAWL}/linkdb" "${CRAWL2}/linkdb" \
  "${CRAWL3}/linkdb" -normalize
#----- Merge Segments
"$NUTCH" mergesegs "${MERGED_CRAWL}/segments" "${CRAWL2}/segments/"* \
  "${CRAWL3}/segments/"* -filter
#----- Invert Links
"$NUTCH" invertlinks "${MERGED_CRAWL}/linkdb" "${MERGED_CRAWL}/segments/"*
#----- Index
"$NUTCH" index "${MERGED_CRAWL}/NEWindexes" "${MERGED_CRAWL}/crawldb" \
  "${MERGED_CRAWL}/linkdb" "${MERGED_CRAWL}/segments/"*
#----- Dedup
"$NUTCH" dedup "${MERGED_CRAWL}/NEWindexes"
#----- Merge indexes
"$NUTCH" merge "${MERGED_CRAWL}/index" "${MERGED_CRAWL}/NEWindexes"
#now move the whole thing into place
"$TOMCAT" stop
# replace indexes with indexes_merged
if [ -d "${OLD_CRAWL}" ]; then
  rm -rf "${OLD_CRAWL}"
fi
mv "${CRAWL}" "${OLD_CRAWL}"
mv "${MERGED_CRAWL}" "${CRAWL}"
# NOTE(review): 'touch' is not a standard init.d action — presumably a
# site-specific action of tomcatsearch; confirm it exists.
"$TOMCAT" touch
"$TOMCAT" start
# The temporary un-merged index set is no longer needed once the merged
# index is in place.
if [ -d "${CRAWL}/NEWindexes" ]; then
  rm -rf "${CRAWL}/NEWindexes"
fi
--
View this message in context: http://www.nabble.com/schedule-recrawling-in-nutch-tp19139009p19139009.html
Sent from the Nutch - User mailing list archive at Nabble.com.