You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2011/08/24 14:30:25 UTC

svn commit: r1161072 - /mahout/trunk/integration/bin/prep_asf_mail_archives.sh

Author: gsingers
Date: Wed Aug 24 12:30:25 2011
New Revision: 1161072

URL: http://svn.apache.org/viewvc?rev=1161072&view=rev
Log:
MAHOUT-795: change the script to not download

Modified:
    mahout/trunk/integration/bin/prep_asf_mail_archives.sh

Modified: mahout/trunk/integration/bin/prep_asf_mail_archives.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/bin/prep_asf_mail_archives.sh?rev=1161072&r1=1161071&r2=1161072&view=diff
==============================================================================
--- mahout/trunk/integration/bin/prep_asf_mail_archives.sh (original)
+++ mahout/trunk/integration/bin/prep_asf_mail_archives.sh Wed Aug 24 12:30:25 2011
@@ -8,16 +8,15 @@
 #   $1 - Path to this script's working directory, you will need about
 #        22GB of free space to run this script.
 #
-#   $2 - Path to where this script saves the SequenceFile output.
+#   $2 - Path to where the ASF Public Archive data is, untarred.
+#        If you are running Hadoop and the files are in HDFS, then
+#        this will need to be an HDFS path.   Default is $1/input
+#   $3 - Path to where this script saves the SequenceFile output.
 #        If you are running Hadoop and you want the sequence files
 #        saved to your HDFS then you need to set this value to an 
 #        HDFS path and make sure you set HADOOP_HOME so Mahout can
-#        find Hadoop.
+#        find Hadoop.  Default is $1/sequence-files
 #
-#   In addition, you will need to install, configure and add s3cmd
-#   to your PATH before running this script. s3cmd is needed to
-#   download the TAR files from Amazon S3, for more information, see:
-#      http://s3tools.org/s3cmd
 #
 # Required Environment Variables:
 #
@@ -28,7 +27,7 @@
 #          Only needed if you want to send output to HDFS
 #
 # Example:
-#   ./prep_asf_mail_archives.sh /mnt/asf-mail-archives /mnt/asf-mail-archives/output
+#   ./prep_asf_mail_archives.sh /mnt/asf-mail-archives /mnt/asf-archives/asf-mail-archives-7-18-2011 /mnt/asf-mail-archives/output
 #
 #   This will download the TAR files from S3, extract them, and then
 #   run the Mahout org.apache.mahout.text.SequenceFilesFromMailArchives job
@@ -56,12 +55,6 @@ if [ "$MAHOUT_HOME" = "" ]; then
   exit 1
 fi
 
-# Make sure they have s3cmd installed
-command -v s3cmd >/dev/null || { 
-  echo "Error: s3cmd command not found. See http://s3tools.org/s3cmd for more information."; 
-  exit 1; 
-}
-
 if [ "$1" = "" ]; then
   echo "Error: Please pass the path to your prep directory, such as /mnt/asf-mail-archives.\n\n\tUsage: $0 workingDir outputPath\n"
   exit 1
@@ -70,9 +63,16 @@ fi
 # Location where this script saves files
 PREP_DIR=$1
 
-# Change this to an HDFS path if you are running Hadoop
 if [ "$2" != "" ]; then
-  SEQFILE_OUTPUT_DIR=$2
+  SEQFILE_INPUT_DIR=$2
+else
+  SEQFILE_INPUT_DIR=$PREP_DIR/input
+fi
+
+
+# Change this to an HDFS path if you are running Hadoop
+if [ "$3" != "" ]; then
+  SEQFILE_OUTPUT_DIR=$3
 else
   SEQFILE_OUTPUT_DIR=$PREP_DIR/sequence-files
 fi
@@ -90,6 +90,7 @@ fi
 
 echo "Running $0 with:
   PREP_DIR = $PREP_DIR
+  SEQFILE_INPUT_DIR = $SEQFILE_INPUT_DIR
   SEQFILE_OUTPUT_DIR = $SEQFILE_OUTPUT_DIR
   MAHOUT_LOCAL = $MAHOUT_LOCAL
   HADOOP_HOME = $HADOOP_HOME"
@@ -97,101 +98,9 @@ echo "Running $0 with:
 # Run Mahout in Local mode! Remove this if you want the
 # sequence files stored in your HDFS
 
-mkdir -p $PREP_DIR/downloads $PREP_DIR/extracted
-
-# download the tar files from S3
-
-cd $PREP_DIR/downloads
-if [ ! -e public_a_d.tar ]
-then
-  echo "Downloading public_a_d.tar files from S3 to $PREP_DIR/downloads"
-  s3cmd get s3://asf-mail-archives/public_a_d.tar || {
-    echo "Download from S3 failed, check console for errors.";
-    exit 1;
-  }
-fi
-if [ ! -e public_e_k.tar ]
-then
-  echo "Downloading public_e_k.tar files from S3 to $PREP_DIR/downloads"
-  s3cmd get s3://asf-mail-archives/public_e_k.tar || {
-   echo "Download from S3 failed, check console for errors.";
-   exit 1;
-  }
-fi
-if [ ! -e public_l_o.tar ]
-then
-  echo "Downloading public_l_o.tar files from S3 to $PREP_DIR/downloads"
-  s3cmd get s3://asf-mail-archives/public_l_o.tar || {
-   echo "Download from S3 failed, check console for errors.";
-    exit 1;
-  }
-fi
-if [ ! -e public_s_t.tar ]
-then
-  echo "Downloading public_s_t.tar files from S3 to $PREP_DIR/downloads"
-  s3cmd get s3://asf-mail-archives/public_s_t.tar || {
-   echo "Download from S3 failed, check console for errors.";
-    exit 1;
-  }
-fi
-if [ ! -e public_u_z.tar ]
-then
-  echo "Downloading public_u_z.tar files from S3 to $PREP_DIR/downloads"
-  s3cmd get s3://asf-mail-archives/public_u_z.tar || {
-   echo "Download from S3 failed, check console for errors.";
-    exit 1;
-  }
-fi
-
-
-
-# extract the tar files to your local drive
-
-cd $PREP_DIR/extracted
-#check to see if we have already extracted
-if [ ! -e "$PREP_DIR/extracted/abdera.apache.org" ]
-then
-  echo "Extracting tar files from $PREP_DIR/downloads/public_a_d.tar"
-  tar xf $PREP_DIR/downloads/public_a_d.tar || {
-    echo "Extract TAR files failed, check console for errors.";
-    exit 1;
-  }
-fi
-if [ ! -e "$PREP_DIR/extracted/excalibur.apache.org" ]
-then
-  echo "Extracting tar files from $PREP_DIR/downloads/public_e_k.tar"
-  tar xf $PREP_DIR/downloads/public_e_k.tar || {
-    echo "Extract TAR files failed, check console for errors.";
-    exit 1;
-  }
-fi
-if [ ! -e "$PREP_DIR/extracted/labs.apache.org" ]
-then
-  echo "Extracting tar files from $PREP_DIR/downloads/public_l_o.tar"
-  tar xf $PREP_DIR/downloads/public_l_o.tar || {
-    echo "Extract TAR files failed, check console for errors.";
-    exit 1;
-  }
-fi
-if [ ! -e "$PREP_DIR/extracted/shale.apache.org" ]
-then
-  echo "Extracting tar files from $PREP_DIR/downloads/public_s_t.tar"
-  tar xf $PREP_DIR/downloads/public_s_t.tar || {
-    echo "Extract TAR files failed, check console for errors.";
-    exit 1;
-  }
-fi
-if [ ! -e "$PREP_DIR/extracted/uima.apache.org" ]
-then
-  echo "Extracting tar files from $PREP_DIR/downloads/public_u_z.tar"
-  tar xf $PREP_DIR/downloads/public_u_z.tar || {
-    echo "Extract TAR files failed, check console for errors.";
-    exit 1;
-  }
-fi
 
 # convert the extracted gz files into Hadoop SequenceFiles
 echo "Converting extracted directories to SequenceFiles ..."
 $MAHOUT_HOME/bin/mahout org.apache.mahout.text.SequenceFilesFromMailArchives \
---input $PREP_DIR/extracted --output $SEQFILE_OUTPUT_DIR \
+--input $SEQFILE_INPUT_DIR --output $SEQFILE_OUTPUT_DIR \
 -c UTF-8 -chunk 1024 -prefix asf_archives