You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by mi...@apache.org on 2004/01/09 11:13:34 UTC

cvs commit: cocoon-lenya/src/webapp/lenya/bin crawl_and_index.sh crawl_and_index.xml

michi       2004/01/09 02:13:34

  Modified:    src/webapp/lenya/bin crawl_and_index.sh crawl_and_index.xml
  Log:
  Added xpdf as a target (new case branch in crawl_and_index.sh and new Ant target in crawl_and_index.xml)
  
  Revision  Changes    Path
  1.22      +14 -41    cocoon-lenya/src/webapp/lenya/bin/crawl_and_index.sh
  
  Index: crawl_and_index.sh
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/webapp/lenya/bin/crawl_and_index.sh,v
  retrieving revision 1.21
  retrieving revision 1.22
  diff -u -r1.21 -r1.22
  --- crawl_and_index.sh	7 Dec 2003 17:17:53 -0000	1.21
  +++ crawl_and_index.sh	9 Jan 2004 10:13:33 -0000	1.22
  @@ -3,6 +3,7 @@
   DIRNAME=`dirname $0`
   echo "INFO: dirname = $DIRNAME"
   
  +echo "INFO: HOME = $HOME"
   WEBAPP_DIR=$HOME/src/cocoon-lenya/build/lenya/webapp
   LIB_DIR=$WEBAPP_DIR/WEB-INF/lib
   JAVA=/usr/lib/j2sdk1.4/bin/java
  @@ -41,8 +42,20 @@
           echo "=========================================================="
           echo ""
   	;;
  +    xpdf)
  +        echo ""
  +        echo "=========================================================="
  +        echo "Target: $1"
  +        echo "=========================================================="
  +        echo ""
  +
  +        HTDOCS_DUMP_DIR=$2
  +
  +        echo "INFO: HTDOCS_DUMP_DIR = $HTDOCS_DUMP_DIR"
  +        ##find $HTDOCS_DUMP_DIR -name "*.pdf" -print -exec $XPDF -htmlmeta {} {}.txt \;
  +	;;
       *)
  -        echo "Usage: $0 {crawl|index}"
  +        echo "Usage: $0 {crawl|index|xpdf}"
           exit 1
           ;;
   esac
  @@ -73,54 +86,14 @@
   
   
   
  -CRAWLER_CONF=$1
  -LUCENE_CONF=$2
  -
  -if ! ([ $CRAWLER_CONF ] && [ $LUCENE_CONF ]);then
  -  echo ""
  -  echo "Usage: crawl_and_index.sh crawler.xconf lucene.xconf"
  -  exit 0
  -fi
  -
  -echo ""
  -echo "=========================================================="
  -echo "Target: crawl"
  -echo "=========================================================="
  -echo ""
  -$JAVA -cp $CLASSPATH org.apache.lenya.search.crawler.CrawlerEnvironment $CRAWLER_CONF
  -#$JAVA -cp $CLASSPATH org.apache.lenya.search.crawler.IterativeHTMLCrawler $CRAWLER_CONF
  -
  -
   
   echo ""
   echo "=========================================================="
   echo "Target: extract_text_from_pdf"
   echo "=========================================================="
   echo ""
  -HTDOCS_DUMP_DIR=`$JAVA -cp $CLASSPATH org.apache.lenya.search.crawler.CrawlerEnvironment $CRAWLER_CONF -name htdocs-dump-dir`
  -##find $HTDOCS_DUMP_DIR -name "*.pdf" -print -exec $XPDF -htmlmeta {} {}.txt \;
  -find $HTDOCS_DUMP_DIR -name "*.pdf.txt" -print
  -
  -##$XPDF -htmlmeta $FILE_PDF $FILE_PDF.txt
   
   CLASSPATH=$CLASSPATH:$PDFBOX/classes
   ##$JAVA -cp $CLASSPATH org.pdfbox.Main $FILE_PDF $FILE_PDF.txt
   
   ##http://www.adobe.com/products/acrobat/access_simple_form.html
  -
  -
  -
  -echo ""
  -echo "=========================================================="
  -echo "Target: Regression Test"
  -echo "=========================================================="
  -echo ""
  -
  -
  -
  -
  -echo ""
  -echo "=========================================================="
  -echo "Target: Move index and htdocs_dump"
  -echo "=========================================================="
  -echo ""
  
  
  
  1.19      +13 -5     cocoon-lenya/src/webapp/lenya/bin/crawl_and_index.xml
  
  Index: crawl_and_index.xml
  ===================================================================
  RCS file: /home/cvs/cocoon-lenya/src/webapp/lenya/bin/crawl_and_index.xml,v
  retrieving revision 1.18
  retrieving revision 1.19
  diff -u -r1.18 -r1.19
  --- crawl_and_index.xml	9 Dec 2003 16:56:32 -0000	1.18
  +++ crawl_and_index.xml	9 Jan 2004 10:13:33 -0000	1.19
  @@ -5,7 +5,7 @@
   
   
   <target name="main" description="Says Hi" depends="crawl, index">
  -  <echo>Hi</echo>
  +  <echo>Hello</echo>
   </target>
   
   
  @@ -14,12 +14,15 @@
   <target name="init">
     <echo message="INFO: Init"/>
   
  -  <property name="lenya.dir" value="../../../.."/>
  +  <property name="lenya.dir" value="/home/username/src/cocoon-lenya"/>
  +  <!--<property name="lenya.dir" value="../../../.."/>-->
  +  <!--<property name="lenya.dir" value="."/>-->
     
     <!-- Dummy arguments -->
     <property name="crawler.xconf" value="src/webapp/lenya/pubs/oscom/config/search/crawler.xconf"/>
     <property name="lucene.xconf" value="src/webapp/lenya/pubs/oscom/content/lucene.xconf"/>
     <property name="debug" value="false"/>
  +  <property name="htdocs.dump.dir" value="src/webapp/lenya/pubs/oscom/work/search/lucene"/>
     <!-- /Dummy arguments -->
   
     <property name="lenya.dir.web-inf" value="${lenya.dir}/build/lenya/webapp/WEB-INF"/>
  @@ -64,7 +67,7 @@
       <classpath refid="class.path"/>
     </java>
   
  -  <exec executable="${lenya.dir}/lenya/bin/crawl_and_index.sh">
  +  <exec executable="${lenya.dir}/src/webapp/lenya/bin/crawl_and_index.sh">
       <arg value="index"/>
       <arg value="${lucene.xconf}"/>
       <arg value="${debug}"/>
  @@ -101,8 +104,13 @@
     <echo>INFO: Publish Index</echo>
   </target>
   
  -<target name="pdf" description="Extract text from PDF" depends="init">
  -  <echo>INFO: Extract text from PDF</echo>
  +<target name="xpdf" description="Extract text from PDF with Xpdf" depends="init">
  +  <echo>INFO: Extract text from PDF with Xpdf (${htdocs.dump.dir})</echo>
  +
  +  <exec executable="${lenya.dir}/src/webapp/lenya/bin/crawl_and_index.sh">
  +    <arg value="xpdf"/>
  +    <arg value="${htdocs.dump.dir}"/>
  +  </exec>
   </target>
   
   <target name="show-config" description="Show configuration" depends="init">
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lenya-cvs-unsubscribe@cocoon.apache.org
For additional commands, e-mail: lenya-cvs-help@cocoon.apache.org