You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by rv...@apache.org on 2015/07/07 14:53:03 UTC
[04/18] jena git commit: Various further improvements to the scripts
(JENA-977)
Various further improvements to the scripts (JENA-977)
- Validate sort temporary directory when indexing and WARN if the disk
it is on is low on space (10% or less free)
- Support --debug and --trace flags in all scripts, add various debug
output throughout scripts
- Fix a bug with not detecting sort failure when pv is used to monitor
progress
- Fix a bug in size calculations used for progress monitoring and sort
failure detection
This commit includes some temporary DEV changes that will be reverted
later
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/7770596b
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/7770596b
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/7770596b
Branch: refs/heads/master
Commit: 7770596bc94613409fe2753240b603ae22a38b57
Parents: a96b016
Author: Rob Vesse <rv...@apache.org>
Authored: Fri Jun 26 16:15:18 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Fri Jun 26 16:31:05 2015 +0100
----------------------------------------------------------------------
apache-jena/bin/tdbloader2 | 59 +++++++++++-----
apache-jena/bin/tdbloader2data | 43 ++++++++++--
apache-jena/bin/tdbloader2index | 126 ++++++++++++++++++++++++++++++-----
3 files changed, 192 insertions(+), 36 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index 9ff2727..9508031 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -49,6 +49,10 @@ Common additional options are as follows:
Advanced additional options are as follows:
+ -d
+ --debug
+ Enable debug mode, adds extra debug output
+
-k
--keep-work
Keeps the temporary work files around after they are no longer
@@ -65,6 +69,10 @@ Advanced additional options are as follows:
When no phase is specified it defaults to all
+ -t
+ --trace
+ Enable trace mode, essentially sets -x within the scripts
+
EOF
}
@@ -101,13 +109,12 @@ case "$(uname)" in
esac
export JENA_CP
-#echo $JENA_CP
-if [ -z "$SORT_ARGS" ]
-then
+# echo JENA_CP
+if [ -z "$SORT_ARGS" ]; then
SORT_ARGS="--buffer-size=50%"
- if $(sort --parallel=3 < /dev/null 2>/dev/null)
- then
- SORT_ARGS="$SORT_ARGS --parallel=3"
+ sort --parallel=3 < /dev/null 2>/dev/null
+ if [ $? = 0 ]; then
+ SORT_ARGS="$SORT_ARGS --parallel=3"
fi
fi
export SORT_ARGS
@@ -116,11 +123,23 @@ export SORT_ARGS
LOC=
PHASE=
KEEP_WORK=0
+DEBUG=0
+TRACE=0
while [ $# -gt 0 ]
do
ARG=$1
case "$ARG" in
+ -d|--debug)
+ # Debug Mode
+ shift
+ DEBUG=1
+ ;;
+ -h|--help)
+ # Help
+ printUsage
+ exit 0
+ ;;
-k|--keep-work)
# Keep work files
shift
@@ -143,10 +162,11 @@ do
PHASE="$1"
shift
;;
- -h|--help)
- # Help
- printUsage
- exit 0
+ -t|--trace)
+ # Trace mode
+ shift
+ TRACE=1
+ set -x
;;
*)
# Once we see an unrecognized argument treat as start of files to process
@@ -159,9 +179,15 @@ if [ -z "$PHASE" ]; then
PHASE="all"
fi
COMMON_ARGS=
-if [ $KEEP_WORK = 0 ]; then
+if [ $KEEP_WORK = 1 ]; then
COMMON_ARGS="--keep-work"
fi
+if [ $DEBUG = 1 ]; then
+ COMMON_ARGS="$COMMON_ARGS --debug"
+fi
+if [ $TRACE = 1 ]; then
+ COMMON_ARGS="$COMMON_ARGS --trace"
+fi
log() { echo " $(date $DATE)" "$@" ; }
@@ -172,16 +198,19 @@ DATE="+%H:%M:%S"
log "-- TDB Bulk Loader Start"
TIME1="$(date +%s)"
+TOOL_DIR=$JENA_HOME/bin/
+# DEV - Following is just for debugging
+TOOL_DIR=
case "$PHASE" in
all)
- exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
- exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC"
+ exec "${TOOL_DIR}tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
+ exec "${TOOL_DIR}tdbloader2index" $COMMON_ARGS --loc "$LOC"
;;
data)
- exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
+ exec "${TOOL_DIR}tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
;;
index)
- exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC"
+ exec "${TOOL_DIR}tdbloader2index" $COMMON_ARGS --loc "$LOC"
;;
*)
echo "Unrecognized phase $PHASE" 1>&2
http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
index 5aceb27..efb590a 100755
--- a/apache-jena/bin/tdbloader2data
+++ b/apache-jena/bin/tdbloader2data
@@ -52,35 +52,58 @@ Common additional options are as follows:
Advanced additional options are as follows:
+ -d
+ --debug
+ Enable debug mode, adds extra debug output
+
-k
--keep-work
Keeps the temporary work files around after they are no longer
needed. May be useful for debugging.
+ -t
+ --trace
+ Enable trace mode, essentially sets -x within the scripts
+
EOF
}
# Exit on error.
set -e
-# Sort order is ASCII
-export LC_ALL="C"
-
log() { echo " $(date $DATE)" "$@" ; }
+function debug() {
+ if [ $DEBUG = 1 ]; then
+ log "DEBUG" "$@"
+ fi
+}
+
#DATE="+%Y-%m-%dT%H:%M:%S%:z"
DATE="+%H:%M:%S"
PKG=org.apache.jena.tdb.store.bulkloader2
+#DEV - Allows use against Jena 2 API
+PKG=com.hp.hpl.jena.tdb.store.bulkloader2
# Process Arguments
LOC=
KEEP_WORK=0
+DEBUG=0
while [ $# -gt 0 ]
do
ARG=$1
case "$ARG" in
+ -d|--debug)
+ # Debug Mode
+ shift
+ DEBUG=1
+ ;;
+ -h|--help)
+ printUsage
+ exit 0
+ ;;
-k|--keep-work)
# Keep work files
# This option is actually not used by this script but may be passed in
@@ -99,9 +122,10 @@ do
LOC=${ARG/-*loc=/}
shift
;;
- -h|--help)
- printUsage
- exit 0
+ -t|--trace)
+ # Trace mode
+ shift
+ set -x
;;
*)
# Any further arguments are treated as data files
@@ -124,18 +148,23 @@ fi
if [ ! -e "$LOC" ] ; then
# If non-existent try to create
+ debug "Trying to create new database directory: $LOC"
mkdir "$LOC"
if [ $? != 0 ]; then
echo "Failed to create new directory: $LOC"
exit 1
fi
+ debug "New database directory created: $LOC"
fi
if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
+# TODO Make LOC absolute
+
FILES="$@"
## JVM Arguments
JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+debug "JVM Arguments are $JVM_ARGS"
# Classpath set in "tdbloader2"
if [ -z "$JENA_CP" ]
@@ -151,6 +180,8 @@ log "Data Load Phase"
DATA_TRIPLES="$LOC/data-triples.tmp"
DATA_QUADS="$LOC/data-quads.tmp"
+debug "Data files are $DATA_TRIPLES and $DATA_QUADS"
+
java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdNodeTableBuilder \
"--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" $FILES
http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index 2730af1..971b824 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -49,11 +49,18 @@ Common additional options are as follows:
Advanced additional options are as follows:
+ -d
+ --debug
+ Enable debug mode, adds extra debug output
+
-k
--keep-work
Keeps the temporary work files around after they are no longer
needed. May be useful for debugging.
+ -t
+ --trace
+ Enable trace mode, essentially sets -x within the scripts
EOF
}
@@ -65,20 +72,45 @@ export LC_ALL="C"
log() { echo " $(date $DATE)" "$@" ; }
-TMP=$$
+function debug() {
+ if [ $DEBUG = 1 ]; then
+ log "DEBUG" "$@"
+ fi
+}
+
+function warn() {
+ log "WARN" "$@"
+}
+
+function getSize() {
+ ls -l $1 | awk '{print $5}'
+}
+
#DATE="+%Y-%m-%dT%H:%M:%S%:z"
DATE="+%H:%M:%S"
PKG=org.apache.jena.tdb.store.bulkloader2
+#DEV - Allows use against Jena 2 API
+PKG=com.hp.hpl.jena.tdb.store.bulkloader2
# Process Arguments
LOC=
KEEP_WORK=0
+DEBUG=0
while [ $# -gt 0 ]
do
ARG=$1
case "$ARG" in
+ -d|--debug)
+ # Debug Mode
+ shift
+ DEBUG=1
+ ;;
+ -h|--help)
+ printUsage
+ exit 0
+ ;;
-k|--keep-work)
# Keep work files
shift
@@ -95,9 +127,10 @@ do
LOC=${ARG/-*loc=/}
shift
;;
- -h|--help)
- printUsage
- exit 0
+ -t|--trace)
+ # Trace mode
+ shift
+ set -x
;;
*)
# Any further arguments are ignored
@@ -111,6 +144,8 @@ if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; exit 1; fi
if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
+# TODO Make LOC absolute
+
DATA_TRIPLES="$LOC/data-triples.tmp"
DATA_QUADS="$LOC/data-quads.tmp"
@@ -123,9 +158,12 @@ if [ ! -e "$DATA_QUADS" ]; then
exit 1
fi
+debug "Data files are $DATA_TRIPLES and $DATA_QUADS"
+
##--parallel is not always available.
SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+debug "JVM Arguments are $JVM_ARGS"
# Classpath set in "tdbloader2"
if [ -z "$JENA_CP" ]
@@ -133,17 +171,57 @@ then
echo "Classpath not provided : set JENA_CP" 1>&2
exit 1
fi
+debug "Jena Classpath is $JENA_CP"
# ---- Index intermediates
## All files are writtern S P O / G S P O columns per row but in different sort orders.
log "Index Building Phase"
# Check whether Pipe Viewer is available
-# Needs to temporarily disable exit on error
+# Needs to temporarily disable exit on error because `which` returns a
+# non-zero exit status if the given command is not found
set +e
which pv >/dev/null 2>&1
HAS_PV=$?
set -e
+if [ $HAS_PV = 0 ]; then
+ debug "pv (Pipe Viewer) available on your system so sorts will show progres"
+else
+ debug "No pv (Pipe Viewer) on your system so sorts will show no progress"
+fi
+
+# Check where we are storing temporary sort files
+debug "Sort Arguments: $SORT_ARGS"
+SORT_TEMP_DIR=
+if [[ "$SORT_ARGS" == *"-T "* ]]; then
+ # Specified via -T argument
+ SORT_TEMP_DIR=(${SORT_ARGS/-T /})
+ SORT_TEMP_DIR=${SORT_TEMP_DIR[0]}
+elif [[ "$SORT_ARGS" == *"--temporary-directory="* ]]; then
+ # Specified via --temporary-directory argument
+ SORT_TEMP_DIR=(${SORT_ARGS/--temporary-directory=/})
+ SORT_TEMP_DIR=${SORT_TEMP_DIR[0]}
+else
+ # Using the system temp directory
+ SORT_TEMP_DIR="$TMPDIR"
+fi
+debug "Sort Temp Directory: $SORT_TEMP_DIR"
+
+# Find out how much space is on the sort directory
+SORT_DRIVE_INFO=$(df "$SORT_TEMP_DIR" | tail -n +2)
+SORT_DRIVE_DISK=$(echo $SORT_DRIVE_INFO | awk '{print $1}')
+SORT_DRIVE_FREE_SPACE=$(echo $SORT_DRIVE_INFO | awk '{print $4}')
+SORT_DRIVE_USED=$(echo $SORT_DRIVE_INFO | awk '{print $5}')
+SORT_DRIVE_FREE=${SORT_DRIVE_USED/"%"/}
+SORT_DRIVE_FREE=$((100 - $SORT_DRIVE_FREE))
+debug "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_DISK} which has ${SORT_DRIVE_FREE}% free space (${SORT_DRIVE_FREE_SPACE} bytes)"
+
+if [ $SORT_DRIVE_FREE -le 10 ]; then
+ echo
+ warn "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_DISK} which only has ${SORT_DRIVE_FREE}% free space (${SORT_DRIVE_FREE_SPACE} bytes) available"
+ warn "This may result in sort failures if the data to be indexed is large"
+ echo
+fi
generate_index()
{
@@ -152,35 +230,52 @@ generate_index()
local IDX=$3
local WORK="$LOC/$IDX-txt"
- if [ ! -s "$DATA" ]
- then
+ if [ ! -s "$DATA" ]; then
+ debug "Skipping Index $IDX as no relevant data to index"
return
fi
log "Creating Index $IDX"
# Sort the input data
- log " Sort $IDX"
+ log "Sort $IDX"
+ debug "Sorting $DATA into work file $WORK"
if [ $HAS_PV = 0 ]; then
# Use pv (pipe viewer) to monitor sort progress
# Note that progress data will only be seen if running in the foreground
- SIZE=$(du -k "$DATA" | cut -f 1)
+ # To report progress need to know size of input data
+ SIZE=$(getSize "$DATA")
+ debug "Size of data to be sorted is $SIZE bytes"
+
pv -c -N data < "$DATA" | sort $SORT_ARGS -u $KEYS | pv -c -N sort -s $SIZE > $WORK
+
+ # CAUTION
+ # If sort errors here then the piping through pv will stop us from seeing the error
+ # and we'll continue onwards
+ # Therefore we need to check that the output size is same as input size as this is
+ # the only way to tell if sort succeeded
+ OUTPUT_SIZE=$(getSize "$WORK")
+ debug "Size of sorted data is $OUTPUT_SIZE bytes"
+ if [ $SIZE != $OUTPUT_SIZE ]; then
+ log "Aborting due to sort error"
+ exit 1
+ fi
else
# Use sort without any progress monitoring
sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK
fi
- log " Sort $IDX Completed"
+ log "Sort $IDX Completed"
# Build into an index
- log " Build $IDX"
+ log "Build $IDX"
rm -f "$LOC/$IDX.dat"
rm -f "$LOC/$IDX.idn"
java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
- log " Build $IDX Completed"
+ log "Build $IDX Completed"
# Remove work file unless keeping
- if [ $KEEP_WORK = 1 ]; then
+ if [ $KEEP_WORK = 0 ]; then
+ debug "Cleaning up work file $WORK"
rm "$WORK"
fi
}
@@ -211,6 +306,7 @@ generate_index "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
log "Index Building Phase Completed"
# ---- Clean up.
-if [ $KEEP_WORK = 1 ]; then
- rm -f "$DATA_TRIPLES" "$DATA_QUADS"
+if [ $KEEP_WORK = 0 ]; then
+ debug "Cleaning up data files $DATA_TRIPLES and $DATA_QUADS"
+ rm -f "$DATA_TRIPLES" "$DATA_QUADS"
fi