You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by mi...@apache.org on 2004/01/09 11:13:34 UTC
cvs commit: cocoon-lenya/src/webapp/lenya/bin crawl_and_index.sh crawl_and_index.xml
michi 2004/01/09 02:13:34
Modified: src/webapp/lenya/bin crawl_and_index.sh crawl_and_index.xml
Log:
Added xpdf as a target
Revision Changes Path
1.22 +14 -41 cocoon-lenya/src/webapp/lenya/bin/crawl_and_index.sh
Index: crawl_and_index.sh
===================================================================
RCS file: /home/cvs/cocoon-lenya/src/webapp/lenya/bin/crawl_and_index.sh,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -r1.21 -r1.22
--- crawl_and_index.sh 7 Dec 2003 17:17:53 -0000 1.21
+++ crawl_and_index.sh 9 Jan 2004 10:13:33 -0000 1.22
@@ -3,6 +3,7 @@
DIRNAME=`dirname $0`
echo "INFO: dirname = $DIRNAME"
+echo "INFO: HOME = $HOME"
WEBAPP_DIR=$HOME/src/cocoon-lenya/build/lenya/webapp
LIB_DIR=$WEBAPP_DIR/WEB-INF/lib
JAVA=/usr/lib/j2sdk1.4/bin/java
@@ -41,8 +42,20 @@
echo "=========================================================="
echo ""
;;
+ xpdf)
+ echo ""
+ echo "=========================================================="
+ echo "Target: $1"
+ echo "=========================================================="
+ echo ""
+
+ HTDOCS_DUMP_DIR=$2
+
+ echo "INFO: HTDOCS_DUMP_DIR = $HTDOCS_DUMP_DIR"
+ ##find $HTDOCS_DUMP_DIR -name "*.pdf" -print -exec $XPDF -htmlmeta {} {}.txt \;
+ ;;
*)
- echo "Usage: $0 {crawl|index}"
+ echo "Usage: $0 {crawl|index|xpdf}"
exit 1
;;
esac
@@ -73,54 +86,14 @@
-CRAWLER_CONF=$1
-LUCENE_CONF=$2
-
-if ! ([ $CRAWLER_CONF ] && [ $LUCENE_CONF ]);then
- echo ""
- echo "Usage: crawl_and_index.sh crawler.xconf lucene.xconf"
- exit 0
-fi
-
-echo ""
-echo "=========================================================="
-echo "Target: crawl"
-echo "=========================================================="
-echo ""
-$JAVA -cp $CLASSPATH org.apache.lenya.search.crawler.CrawlerEnvironment $CRAWLER_CONF
-#$JAVA -cp $CLASSPATH org.apache.lenya.search.crawler.IterativeHTMLCrawler $CRAWLER_CONF
-
-
echo ""
echo "=========================================================="
echo "Target: extract_text_from_pdf"
echo "=========================================================="
echo ""
-HTDOCS_DUMP_DIR=`$JAVA -cp $CLASSPATH org.apache.lenya.search.crawler.CrawlerEnvironment $CRAWLER_CONF -name htdocs-dump-dir`
-##find $HTDOCS_DUMP_DIR -name "*.pdf" -print -exec $XPDF -htmlmeta {} {}.txt \;
-find $HTDOCS_DUMP_DIR -name "*.pdf.txt" -print
-
-##$XPDF -htmlmeta $FILE_PDF $FILE_PDF.txt
CLASSPATH=$CLASSPATH:$PDFBOX/classes
##$JAVA -cp $CLASSPATH org.pdfbox.Main $FILE_PDF $FILE_PDF.txt
##http://www.adobe.com/products/acrobat/access_simple_form.html
-
-
-
-echo ""
-echo "=========================================================="
-echo "Target: Regression Test"
-echo "=========================================================="
-echo ""
-
-
-
-
-echo ""
-echo "=========================================================="
-echo "Target: Move index and htdocs_dump"
-echo "=========================================================="
-echo ""
1.19 +13 -5 cocoon-lenya/src/webapp/lenya/bin/crawl_and_index.xml
Index: crawl_and_index.xml
===================================================================
RCS file: /home/cvs/cocoon-lenya/src/webapp/lenya/bin/crawl_and_index.xml,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -r1.18 -r1.19
--- crawl_and_index.xml 9 Dec 2003 16:56:32 -0000 1.18
+++ crawl_and_index.xml 9 Jan 2004 10:13:33 -0000 1.19
@@ -5,7 +5,7 @@
<target name="main" description="Says Hi" depends="crawl, index">
- <echo>Hi</echo>
+ <echo>Hello</echo>
</target>
@@ -14,12 +14,15 @@
<target name="init">
<echo message="INFO: Init"/>
- <property name="lenya.dir" value="../../../.."/>
+ <property name="lenya.dir" value="/home/username/src/cocoon-lenya"/>
+ <!--<property name="lenya.dir" value="../../../.."/>-->
+ <!--<property name="lenya.dir" value="."/>-->
<!-- Dummy arguments -->
<property name="crawler.xconf" value="src/webapp/lenya/pubs/oscom/config/search/crawler.xconf"/>
<property name="lucene.xconf" value="src/webapp/lenya/pubs/oscom/content/lucene.xconf"/>
<property name="debug" value="false"/>
+ <property name="htdocs.dump.dir" value="src/webapp/lenya/pubs/oscom/work/search/lucene"/>
<!-- /Dummy arguments -->
<property name="lenya.dir.web-inf" value="${lenya.dir}/build/lenya/webapp/WEB-INF"/>
@@ -64,7 +67,7 @@
<classpath refid="class.path"/>
</java>
- <exec executable="${lenya.dir}/lenya/bin/crawl_and_index.sh">
+ <exec executable="${lenya.dir}/src/webapp/lenya/bin/crawl_and_index.sh">
<arg value="index"/>
<arg value="${lucene.xconf}"/>
<arg value="${debug}"/>
@@ -101,8 +104,13 @@
<echo>INFO: Publish Index</echo>
</target>
-<target name="pdf" description="Extract text from PDF" depends="init">
- <echo>INFO: Extract text from PDF</echo>
+<target name="xpdf" description="Extract text from PDF with Xpdf" depends="init">
+ <echo>INFO: Extract text from PDF with Xpdf (${htdocs.dump.dir})</echo>
+
+ <exec executable="${lenya.dir}/src/webapp/lenya/bin/crawl_and_index.sh">
+ <arg value="xpdf"/>
+ <arg value="${htdocs.dump.dir}"/>
+ </exec>
</target>
<target name="show-config" description="Show configuration" depends="init">
---------------------------------------------------------------------
To unsubscribe, e-mail: lenya-cvs-unsubscribe@cocoon.apache.org
For additional commands, e-mail: lenya-cvs-help@cocoon.apache.org