You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by rv...@apache.org on 2015/07/07 14:53:05 UTC

[06/18] jena git commit: Further refactoring of tdbloader2 scripts (JENA-977)

Further refactoring of tdbloader2 scripts (JENA-977)

- Move common functions into tdbloader2common script
- Remove duplicated definitions from other scripts and source in the new
  common script
- Add helper function for getting drive information
- Add check in tdbloader2index script which will abort the build if
  there is insufficient free space to sort the data file. The sorted
  output will be the same size as the input, so if there are fewer
  bytes free than the size of the input we can abort early


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/c55c1f74
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/c55c1f74
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/c55c1f74

Branch: refs/heads/master
Commit: c55c1f74b4571eee2c9e333967b5671e862adff7
Parents: 3c59213
Author: Rob Vesse <rv...@apache.org>
Authored: Mon Jun 29 17:21:18 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Mon Jun 29 17:21:18 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2       |  29 ++++-----
 apache-jena/bin/tdbloader2common |  85 +++++++++++++++++++++++++++
 apache-jena/bin/tdbloader2data   |  53 ++++++++---------
 apache-jena/bin/tdbloader2index  | 107 +++++++++++++++-------------------
 4 files changed, 169 insertions(+), 105 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/c55c1f74/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index 55a0faf..b7a1af2 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -77,12 +77,10 @@ EOF
 }
 
 # If JENA_HOME is empty
-if [ -z "$JENA_HOME" ]
-	then
-    SCRIPT="$0"
-    # Catch common issue: script has been symlinked
-	if [ -L "$SCRIPT" ]
-		then
+if [ -z "$JENA_HOME" ];	then
+  SCRIPT="$0"
+  # Catch common issue: script has been symlinked
+	if [ -L "$SCRIPT" ]; then
 		SCRIPT="$(readlink "$0")"
 		# If link is relative
 		case "$SCRIPT" in
@@ -91,9 +89,10 @@ if [ -z "$JENA_HOME" ]
 		esac
 	fi
 
-    # Work out root from script location
-    JENA_HOME="$( cd "$( dirname "$SCRIPT" )/.." && pwd )"
+  # Work out root from script location
+  JENA_HOME="$( cd "$( dirname "$SCRIPT" )/.." && pwd )"
 fi
+source "${JENA_HOME}/bin/tdbloader2common"
 
 # ---- Setup
 JVM_ARGS=${JVM_ARGS:--Xmx1024M}
@@ -189,13 +188,8 @@ if [ $TRACE = 1 ]; then
   COMMON_ARGS="$COMMON_ARGS --trace"
 fi
 
-log() { echo " $(date $DATE)" "$@" ; }
-
-#DATE="+%Y-%m-%dT%H:%M:%S%:z"
-DATE="+%H:%M:%S"
-
 # ---- Start
-log "-- TDB Bulk Loader Start"
+info "-- TDB Bulk Loader Start"
 TIME1="$(date +%s)"
 
 TOOL_DIR="$JENA_HOME/bin"
@@ -211,13 +205,12 @@ case "$PHASE" in
     exec "${TOOL_DIR}/tdbloader2index" $COMMON_ARGS --loc "$LOC"
     ;;
   *)
-    echo "Unrecognized phase $PHASE" 1>&2
-    exit 1
+    abort 1 "Unrecognized phase $PHASE"
     ;;
 esac
 
 # ---- End
 TIME2="$(date +%s)"
-log "-- TDB Bulk Loader Finish"
+info "-- TDB Bulk Loader Finish"
 ELAPSED=$(($TIME2-$TIME1))
-log "-- $ELAPSED seconds"
\ No newline at end of file
+info "-- $ELAPSED seconds"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/jena/blob/c55c1f74/apache-jena/bin/tdbloader2common
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2common b/apache-jena/bin/tdbloader2common
new file mode 100644
index 0000000..beae115
--- /dev/null
+++ b/apache-jena/bin/tdbloader2common
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+## Licensed to the Apache Software Foundation (ASF) under one
+## or more contributor license agreements.  See the NOTICE file
+## distributed with this work for additional information
+## regarding copyright ownership.  The ASF licenses this file
+## to you under the Apache License, Version 2.0 (the
+## "License"); you may not use this file except in compliance
+## with the License.  You may obtain a copy of the License at
+##
+##     http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+## See the License for the specific language governing permissions and
+## limitations under the License.
+
+# Writes a timestamped message to standard output.
+# The timestamp comes from date(1) using the format held in the DATE
+# variable; all arguments are printed after it, separated by spaces.
+function log() {
+  local STAMP="$(date $DATE)"
+  echo " ${STAMP}" "$@"
+}
+
+# Logs a message at DEBUG level, but only when debugging is enabled,
+# i.e. when the DEBUG variable is set to 1.
+function debug() {
+  # Quote and default DEBUG so that an unset or empty value simply
+  # disables debug output instead of making the test itself fail with
+  # a "unary operator expected" error
+  if [ "${DEBUG:-0}" = 1 ]; then
+    log "DEBUG" "$@"
+  fi
+}
+
+# Logs the given message at INFO level via log.
+function info() {
+  log INFO "$@"
+}
+
+# Logs the given message at WARN level; the output of log is redirected
+# to standard error.
+function warn() {
+  log WARN "$@" >&2
+}
+
+# Logs the given message at ERROR level; the output of log is redirected
+# to standard error.
+function error() {
+  log ERROR "$@" >&2
+}
+
+# Logs an error message and terminates the script.
+# Usage: abort [EXIT_CODE] MESSAGE...
+# If the first argument is an integer it is used as the exit code and
+# the remaining arguments form the message; otherwise all arguments
+# form the message and the exit code defaults to 1.
+function abort() {
+  local EXIT=$1
+
+  # Trick to check for numeric
+  # -eq only returns true if the value is integer equals
+  # For a non-integer the test itself emits a diagnostic, which is
+  # discarded so it is not printed ahead of the real error message
+  if [ "$EXIT" -eq "$EXIT" ] 2>/dev/null; then
+    # Can use the provided exit code
+    shift
+  else
+    # Caller forgot to provide an exit code so use default of 1
+    EXIT=1
+  fi
+
+  # Log error and exit
+  error "$@"
+  exit "$EXIT"
+}
+
+# Prints the size of the file given as $1, taken from the fifth column
+# of the long listing produced by ls.
+function getSize() {
+  # Quote the path (and end option parsing) so that files whose path
+  # contains spaces or glob characters are measured correctly
+  ls -l -- "$1" | awk '{print $5}'
+}
+
+# Prints information about the drive that holds the given path.
+# Arguments: $1 - file or directory to query
+# Outputs:   one line with four space separated fields:
+#              disk, used percent, free percent, free bytes
+#            which callers capture into an array, e.g.
+#              INFO=($(getDriveInfo "$DIR"))
+# Returns:   1 if df produced no information for the path
+function getDriveInfo() {
+  local DIR=$1
+
+  # df reports space in blocks whose size differs between platforms, so
+  # request the POSIX output format (-P) in 1024 byte units (-k); this
+  # also keeps each file system on a single line with fixed columns
+  local DRIVE_INFO=$(df -Pk "$DIR" | tail -n +2)
+  if [ -z "$DRIVE_INFO" ]; then
+    error "Failed to get drive information for ${DIR}"
+    return 1
+  fi
+
+  local DISK=$(echo $DRIVE_INFO | awk '{print $1}')
+  local FREE_KB=$(echo $DRIVE_INFO | awk '{print $4}')
+  # Callers compare this value against file sizes in bytes
+  local FREE_BYTES=$(($FREE_KB * 1024))
+  local USED_PERCENT=$(echo $DRIVE_INFO | awk '{print $5}')
+  # Strip the trailing percent sign so the value can be used in arithmetic
+  USED_PERCENT=${USED_PERCENT/"%"/}
+  local FREE_PERCENT=$((100 - $USED_PERCENT))
+
+  echo "$DISK $USED_PERCENT $FREE_PERCENT $FREE_BYTES"
+}
+
+#DATE="+%Y-%m-%dT%H:%M:%S%:z"
+DATE="+%H:%M:%S"
+
+PKG=org.apache.jena.tdb.store.bulkloader2
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/jena/blob/c55c1f74/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
index eaf9069..6904c83 100755
--- a/apache-jena/bin/tdbloader2data
+++ b/apache-jena/bin/tdbloader2data
@@ -18,6 +18,13 @@
 
 # The environment for this sub-script is setup by "tdbloader2"
 
+# Pull in common functions
+if [ -z "$JENA_HOME" ]; then
+  echo "JENA_HOME is not set"
+  exit 1
+fi
+source "${JENA_HOME}/bin/tdbloader2common"
+
 function printUsage() {
   cat << EOF
 tdbloader2data - TDB Bulk Loader - Data Phase
@@ -71,19 +78,6 @@ EOF
 # Exit on error.
 set -e
 
-log() { echo " $(date $DATE)" "$@" ; }
-
-function debug() {
- if [ $DEBUG = 1 ]; then
-   log "DEBUG" "$@"
- fi
-}
-
-#DATE="+%Y-%m-%dT%H:%M:%S%:z"
-DATE="+%H:%M:%S"
-
-PKG=org.apache.jena.tdb.store.bulkloader2
-
 # Process Arguments
 LOC=
 KEEP_WORK=0
@@ -133,15 +127,18 @@ do
 done
 
 # Verify arguments
-if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
-if [ $# = 0 ]; then echo "No data files specified" ; exit 1 ; fi
+if [ -z "$LOC" ]; then
+  abort 1 "No location specified"
+fi
+if [ $# = 0 ]; then
+  abort 1 "No data files specified"
+fi
 
 # Look for any index and data files in the directory.
 # Skip a possible configuration file
 if test -n "$(find "$LOC" -maxdepth 1 -type f ! -name 'this.*' -print -quit)"
 then 
-    echo "Location is not empty: $LOC"
-    exit 1
+    abort 1 "Location is not empty: $LOC"
 fi
 
 if [ ! -e "$LOC" ] ; then
@@ -149,12 +146,13 @@ if [ ! -e "$LOC" ] ; then
   debug "Trying to create new database directory: $LOC"
   mkdir "$LOC"
   if [ $? != 0 ]; then
-    echo "Failed to create new directory: $LOC"
-    exit 1
+    abort 1 "Failed to create new directory: $LOC"
   fi
   debug "New database directory created: $LOC"
 fi
-if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
+if [ ! -d "$LOC" ]; then
+  abort 1 "Location is not a directory: $LOC"
+fi
 
 # TODO Make LOC absolute
 
@@ -165,22 +163,21 @@ JVM_ARGS=${JVM_ARGS:--Xmx1200M}
 debug "JVM Arguments are $JVM_ARGS"
 
 # Classpath set in "tdbloader2"
-if [ -z "$JENA_CP" ]
-then
-    echo "Classpath not provided : set JENA_CP" 1>&2
-    exit 1
+if [ -z "$JENA_CP" ]; then
+  abort 1 "Classpath not provided : set JENA_CP"
 fi
 
 # ---- Data loading phase
-log "Data Load Phase"
-# Produce nodes file and triples/quads text file.
+info "Data Load Phase"
 
+# Produce nodes file and triples/quads text file.
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
-debug "Data files are $DATA_TRIPLES and $DATA_QUADS"
+debug "Triples text files is $DATA_TRIPLES"
+debug "Quads text file is $DATA_QUADS"
 
 java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdNodeTableBuilder \
     "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" $FILES
 
-log "Data Load Phase Completed"
+info "Data Load Phase Completed"

http://git-wip-us.apache.org/repos/asf/jena/blob/c55c1f74/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index f506df9..5de8d6a 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -18,6 +18,13 @@
 
 # The environment for this sub-script is setup by "tdbloader2"
 
+# Pull in common functions
+if [ -z "$JENA_HOME" ]; then
+  echo "JENA_HOME is not set"
+  exit 1
+fi
+source "${JENA_HOME}/bin/tdbloader2common"
+
 function printUsage() {
   cat << EOF
 tdbloader2index - TDB Bulk Loader - Index Phase
@@ -70,27 +77,6 @@ set -e
 # Sort order is ASCII
 export LC_ALL="C"
 
-log() { echo " $(date $DATE)" "$@" ; }
-
-function debug() {
- if [ $DEBUG = 1 ]; then
-   log "DEBUG" "$@"
- fi
-}
-
-function warn() {
-  log "WARN" "$@"
-}
-
-function getSize() {
-  ls -l $1 | awk '{print $5}'
-}
-
-#DATE="+%Y-%m-%dT%H:%M:%S%:z"
-DATE="+%H:%M:%S"
-
-PKG=org.apache.jena.tdb.store.bulkloader2
-
 # Process Arguments
 LOC=
 KEEP_WORK=0
@@ -138,22 +124,26 @@ do
 done
 
 # Verify arguments
-if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
-if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; exit 1; fi
-if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
+if [ -z "$LOC" ]; then
+  abort 1 "No location specified"
+fi
+if [ ! -e "$LOC" ]; then
+  abort 1 "Location specified does not exist: $LOC"
+fi
+if [ ! -d "$LOC" ]; then
+  abort 1 "Location is not a directory: $LOC"
+fi
 
 # TODO Make LOC absolute
 
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
-if [ ! -e "$DATA_TRIPLES" ] ; then
-  echo "No triples data file found in location, please run the tdbloader2data script first"
-  exit 1
+if [ ! -e "$DATA_TRIPLES" ]; then
+  abort 1 "No triples text file found in location, please run the tdbloader2data script first"
 fi
 if [ ! -e "$DATA_QUADS" ]; then
-  echo "No quads data file found in location, please run the tdbloader2data script first"
-  exit 1
+  abort 1 "No quads text file found in location, please run the tdbloader2data script first"
 fi
 
 debug "Data files are $DATA_TRIPLES and $DATA_QUADS"
@@ -164,16 +154,14 @@ JVM_ARGS=${JVM_ARGS:--Xmx1200M}
 debug "JVM Arguments are $JVM_ARGS"
 
 # Classpath set in "tdbloader2"
-if [ -z "$JENA_CP" ]
-then
-    echo "Classpath not provided : set JENA_CP" 1>&2
-    exit 1
+if [ -z "$JENA_CP" ]; then
+  abort 1 "Classpath not provided : set JENA_CP"
 fi
 debug "Jena Classpath is $JENA_CP"
 
 # ---- Index intermediates
 ## All files are written S P O / G S P O columns per row but in different sort orders.
-log "Index Building Phase"
+info "Index Building Phase"
 
 # Check whether Pipe Viewer is available
 # Needs to temporarily disable exit on error as which produces an error
@@ -204,21 +192,14 @@ else
   SORT_TEMP_DIR="$TMPDIR"
 fi
 debug "Sort Temp Directory: $SORT_TEMP_DIR"
+# Gather drive details for the sort directory
+# Array fields: 0 = disk, 1 = used percent, 2 = free percent, 3 = free bytes
+SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
+debug "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_INFO[0]} which has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes)"
 
-# Find out how much space is on the sort directory
-SORT_DRIVE_INFO=$(df "$SORT_TEMP_DIR" | tail -n +2)
-SORT_DRIVE_DISK=$(echo $SORT_DRIVE_INFO | awk '{print $1}')
-SORT_DRIVE_FREE_SPACE=$(echo $SORT_DRIVE_INFO | awk '{print $4}')
-SORT_DRIVE_USED=$(echo $SORT_DRIVE_INFO | awk '{print $5}')
-SORT_DRIVE_FREE=${SORT_DRIVE_USED/"%"/}
-SORT_DRIVE_FREE=$((100 - $SORT_DRIVE_FREE))
-debug "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_DISK} which has ${SORT_DRIVE_FREE}% free space (${SORT_DRIVE_FREE_SPACE} bytes)"
-
-if [ $SORT_DRIVE_FREE -le 10 ]; then
-  echo
-  warn "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_DISK} which only has ${SORT_DRIVE_FREE}% free space (${SORT_DRIVE_FREE_SPACE} bytes) available"
+if [ "${SORT_DRIVE_INFO[2]}" -le 10 ]; then
+  warn "-----"
+  warn "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_INFO[0]} which only has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes) available"
   warn "This may result in sort failures if the data to be indexed is large"
-  echo
+  warn "-----"
 fi
 
 generate_index()
@@ -233,17 +214,26 @@ generate_index()
 	    return
 	  fi
 
-    log "Creating Index $IDX"
+    info "Creating Index $IDX"
+
+    # For various purposes we need to know the size of the input data
+    local SIZE=$(getSize "$DATA")
+    debug "Size of data to be sorted is $SIZE bytes"
+
+    # Verify that we have enough space to sort the data
+    local WORK_DRIVE_INFO=($(getDriveInfo "${WORK}"))
+    if [ "${SIZE}" -ge "${WORK_DRIVE_INFO[3]}" ]; then
+      abort 1 "Insufficient free space on database drive ${WORK_DRIVE_INFO[0]}, there are ${WORK_DRIVE_INFO[3]} bytes free but ${SIZE} bytes are required"
+    else
+      debug "Sufficient free space on database drive ${WORK_DRIVE_INFO[0]} to attempt sorting data file ${DATA} (${SIZE} bytes required from ${WORK_DRIVE_INFO[3]} bytes free)"
+    fi
 
     # Sort the input data
-    log "Sort $IDX"
+    info "Sort $IDX"
     debug "Sorting $DATA into work file $WORK"
     if [ $HAS_PV = 0 ]; then
       # Use pv (pipe viewer) to monitor sort progress
       # Note that progress data will only be seen if running in the foreground
-      # To report progress need to know size of input data
-      SIZE=$(getSize "$DATA")
-      debug "Size of data to be sorted is $SIZE bytes"
 
       pv -c -N data < "$DATA" | sort $SORT_ARGS -u $KEYS | pv -c -N sort -s $SIZE > $WORK
 
@@ -252,24 +242,23 @@ generate_index()
       # and we'll continue onwards
       # Therefore we need to check that the output size is same as input size as this is
       # the only way to tell if sort succeeded
-      OUTPUT_SIZE=$(getSize "$WORK")
+      local OUTPUT_SIZE=$(getSize "$WORK")
       debug "Size of sorted data is $OUTPUT_SIZE bytes"
       if [ $SIZE != $OUTPUT_SIZE ]; then
-        log "Aborting due to sort error"
-        exit 1
+        abort 1 "Aborting due to sort error, see preceding output for error from sort"
       fi
     else
       # Use sort without any progress monitoring
       sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK
     fi
-    log "Sort $IDX Completed"
+    info "Sort $IDX Completed"
 
     # Build into an index
-    log "Build $IDX"
+    info "Build $IDX"
     rm -f "$LOC/$IDX.dat"
     rm -f "$LOC/$IDX.idn"
     java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
-    log "Build $IDX Completed"
+    info "Build $IDX Completed"
 
     # Remove work file unless keeping
     if [ $KEEP_WORK = 0 ]; then
@@ -301,7 +290,7 @@ generate_index "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
 
 generate_index "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
 
-log "Index Building Phase Completed"
+info "Index Building Phase Completed"
 
 # ---- Clean up.
 if [ $KEEP_WORK = 0 ]; then