You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by nalgonda <de...@gmail.com> on 2008/08/25 09:19:13 UTC

schedule recrawling in nutch

Hi ,

I found 3 scripts for scheduled recrawling,
but my doubt is: do these run directly at the command prompt (as scripts),
or do we save them to a file?
If we save them to a file, where should we put it?
If anyone has an idea, please share it.

#!/bin/sh
# Top-level recrawl driver: run the crawl, then merge the indexes.
# stdout is discarded; stderr still reaches the console / cron mail.
#
# Abort if the crawl step fails so we never merge a half-built index.
sh /apps/Linux64/nutch/bin/nutch.sh > /dev/null || exit 1
sh /apps/Linux64/nutch/bin/nutch-merge.sh > /dev/null || exit 1

It calls a script to crawl and then another to merge the indexes...





This does the crawl
#!/bin/sh
#
# ce 11/06/08
# Update the nutch databases: inject seed URLs, run DEPTH rounds of
# generate/fetch/update, merge segments, invert links, index, dedup,
# and merge the fresh indexes into ${CRAWL}/index.

NUTCH_HOME=/apps/Linux64/nutch/nutch-1.0-dev
NUTCH=${NUTCH_HOME}/bin/nutch
CRAWL=/user_data/ARCHIVE/nutch2
DEPTH=10
export JAVA_HOME=/apps/Linux64/nutch/jdk

# Stop at the first failed step so a broken crawl is never indexed
# and the destructive segment cleanup below never runs on bad data.
set -e

#----- Inject
"$NUTCH" inject "${CRAWL}/crawldb" "${NUTCH_HOME}/urls/"

#----- Generate, Fetch, Parse, Update
# POSIX sh has no C-style for loop; count with a while loop instead.
i=0
while [ "$i" -lt "$DEPTH" ]; do
        echo "Generating pass $i"

        # NOTE: in the original post "-adddays 61" was mail-wrapped onto
        # its own line, turning it into a separate (failing) command; it
        # belongs on the generate command line.
        #"$NUTCH" generate "${CRAWL}/crawldb" "${CRAWL}/segments" -adddays 31
        "$NUTCH" generate "${CRAWL}/crawldb" "${CRAWL}/segments" -topN 3000 -adddays 61

        # The newest segment directory is the one generate just created.
        SEG=$(ls -d "${CRAWL}"/segments/* | tail -1)
        "$NUTCH" fetch2 "$SEG"
        #"$NUTCH" fetch "$SEG"

        # "-normalize" was also mail-wrapped; rejoined here.
        "$NUTCH" updatedb "${CRAWL}/crawldb" -dir "${CRAWL}/segments" -filter -normalize

        i=$((i + 1))
done
#end_loop

#----- Merge Segments
"$NUTCH" mergesegs "${CRAWL}/merged_segments" "${CRAWL}"/segments/* -filter
rm -rf "${CRAWL}"/segments/*
mv "${CRAWL}"/merged_segments/* "${CRAWL}/segments"
rmdir "${CRAWL}/merged_segments"

#----- Invert Links
"$NUTCH" invertlinks "${CRAWL}/linkdb" "${CRAWL}"/segments/*

#----- Index
# The original post contained this command twice (once half-commented,
# split by mail wrapping); a single joined invocation is intended.
"$NUTCH" index "${CRAWL}/NEWindexes" "${CRAWL}/crawldb" "${CRAWL}/linkdb" "${CRAWL}"/segments/*

#----- Dedup
"$NUTCH" dedup "${CRAWL}/NEWindexes"

# Drop the previous merged index before rebuilding it.
if [ -d "${CRAWL}/index" ]; then
        rm -rf "${CRAWL}/index"
fi

#----- Merge indexes
"$NUTCH" merge "${CRAWL}/index" "${CRAWL}/NEWindexes"





This merges the two crawls....

yap /apps/Linux64/nutch/bin # cat /apps/Linux64/nutch/bin/nutch-merge.sh
#!/bin/sh
#
# ce 11/06/08
# Merge the secondary crawl (nutch2) into the main crawl (nutch):
# build a fully merged crawl in a temp directory, then swap it into
# place while tomcat is stopped, keeping the old crawl as a backup.

NUTCH_HOME=/apps/Linux64/nutch/nutch-1.0-dev
NUTCH=${NUTCH_HOME}/bin/nutch

#tomcat stop/start
TOMCAT=/etc/init.d/tomcatsearch

#final dest of the crawl - populated and accessed by tomcat
CRAWL=/user_data/ARCHIVE/nutch

#keep the old crawl in case it goes pear-shaped
OLD_CRAWL=/user_data/ARCHIVE/nutch-old

#Temp merged CRAWL is the destination - must be empty
MERGED_CRAWL=/user_data/ARCHIVE/nutch.$$

#our input crawls
CRAWL2=/user_data/ARCHIVE/nutch
CRAWL3=/user_data/ARCHIVE/nutch2

export JAVA_HOME=/apps/Linux64/nutch/jdk

# Abort at the first failure: if any merge step dies we must NOT go on
# to stop tomcat and swap a half-built crawl into the live location.
set -e

# NOTE: in the original post every multi-argument command below was
# mail-wrapped across two lines, which made the continuation execute as
# a separate (failing) command; each is rejoined onto one line here.

"$NUTCH" mergedb "${MERGED_CRAWL}/crawldb" "${CRAWL2}/crawldb" "${CRAWL3}/crawldb" -filter -normalize

"$NUTCH" mergelinkdb "${MERGED_CRAWL}/linkdb" "${CRAWL2}/linkdb" "${CRAWL3}/linkdb" -normalize

#----- Merge Segments
"$NUTCH" mergesegs "${MERGED_CRAWL}/segments" "${CRAWL2}"/segments/* "${CRAWL3}"/segments/* -filter

#----- Invert Links
"$NUTCH" invertlinks "${MERGED_CRAWL}/linkdb" "${MERGED_CRAWL}"/segments/*

#----- Index
"$NUTCH" index "${MERGED_CRAWL}/NEWindexes" "${MERGED_CRAWL}/crawldb" "${MERGED_CRAWL}/linkdb" "${MERGED_CRAWL}"/segments/*

#----- Dedup
"$NUTCH" dedup "${MERGED_CRAWL}/NEWindexes"

#----- Merge indexes
"$NUTCH" merge "${MERGED_CRAWL}/index" "${MERGED_CRAWL}/NEWindexes"

#now move the whole thing into place
"$TOMCAT" stop

# Keep exactly one backup generation of the live crawl.
if [ -d "${OLD_CRAWL}" ]; then
        rm -rf "${OLD_CRAWL}"
fi

mv "${CRAWL}" "${OLD_CRAWL}"
mv "${MERGED_CRAWL}" "${CRAWL}"

"$TOMCAT" touch
"$TOMCAT" start

# The intermediate indexes are no longer needed once merged.
if [ -d "${CRAWL}/NEWindexes" ]; then
        rm -rf "${CRAWL}/NEWindexes"
fi


-- 
View this message in context: http://www.nabble.com/schedule-recrawling-in-nutch-tp19139009p19139009.html
Sent from the Nutch - User mailing list archive at Nabble.com.