You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by rv...@apache.org on 2015/07/07 14:53:06 UTC
[07/18] jena git commit: Further improvements to tdbloader2 scripts
(JENA-977)
Further improvements to tdbloader2 scripts (JENA-977)
- Auto-detection of JENA_HOME now exports it so it is visible to the
child scripts
- Force making database directory path absolute and resolving any
symbolic links in the path
- Additional checks in tdbloader2index to warn if sort is going to be
external and it may run out of temporary disk space for the sort
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/a7ac2797
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/a7ac2797
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/a7ac2797
Branch: refs/heads/master
Commit: a7ac2797856bf60476204b8997b5a5bf4cfa15c5
Parents: c55c1f7
Author: Rob Vesse <rv...@apache.org>
Authored: Tue Jun 30 13:44:29 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Tue Jun 30 13:44:29 2015 +0100
----------------------------------------------------------------------
apache-jena/bin/tdbloader2 | 5 +-
apache-jena/bin/tdbloader2common | 106 ++++++++++++++++++++++++++++++++++
apache-jena/bin/tdbloader2data | 9 ++-
apache-jena/bin/tdbloader2index | 39 +++++++++++--
4 files changed, 152 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/a7ac2797/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index b7a1af2..310ee66 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -78,10 +78,11 @@ EOF
# If JENA_HOME is empty
if [ -z "$JENA_HOME" ]; then
+ echo "JENA_HOME not set, attempting to locate JENA_HOME automatically"
SCRIPT="$0"
# Catch common issue: script has been symlinked
if [ -L "$SCRIPT" ]; then
- SCRIPT="$(readlink "$0")"
+ SCRIPT="$(readlink -f "$0")"
# If link is relative
case "$SCRIPT" in
/*) ;; # fine
@@ -91,6 +92,8 @@ if [ -z "$JENA_HOME" ]; then
# Work out root from script location
JENA_HOME="$( cd "$( dirname "$SCRIPT" )/.." && pwd )"
+ export JENA_HOME
+ echo "Located JENA_HOME at ${JENA_HOME}"
fi
source "${JENA_HOME}/bin/tdbloader2common"
http://git-wip-us.apache.org/repos/asf/jena/blob/a7ac2797/apache-jena/bin/tdbloader2common
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2common b/apache-jena/bin/tdbloader2common
index beae115..2830545 100644
--- a/apache-jena/bin/tdbloader2common
+++ b/apache-jena/bin/tdbloader2common
@@ -79,6 +79,112 @@ function getDriveInfo() {
echo ${INFO[@]}
}
# Report the amount of free physical memory in bytes.
#
# Globals:   OSTYPE (read) - selects the platform specific probe
# Arguments: none
# Outputs:   a single integer on stdout: free memory in bytes, or -1 when
#            it cannot be determined on this platform
# Returns:   always 0; failure is signalled through the -1 value
#
# Every probe is guarded explicitly, so the function neither depends on nor
# changes the caller's errexit (set -e) setting.
function getFreeMem() {
  local FREE_MEM=-1
  local RAW=""

  case "$OSTYPE" in
    darwin*)
      # Have to get this from top, which prints a line such as
      #   PhysMem: 15G used (2065M wired), 1118M unused.
      # (older releases end the line with "..., 4240M free.").
      # Take the value just before the "unused"/"free" label instead of
      # relying on a fixed column position.
      RAW=$(top -l 1 2>/dev/null | awk '/PhysMem/ {
        for (i = 2; i <= NF; i++) {
          if ($i ~ /^(unused|free)/) { print $(i - 1); exit }
        }
      }') || RAW=""

      # The value carries a unit suffix that must be scaled to bytes
      if [[ "$RAW" =~ ^([0-9]+)([KMGT]?)B?$ ]]; then
        # 10# forces base 10 so a leading zero is not read as octal
        FREE_MEM=$((10#${BASH_REMATCH[1]}))
        case "${BASH_REMATCH[2]}" in
          K) FREE_MEM=$((FREE_MEM * 1024)) ;;
          M) FREE_MEM=$((FREE_MEM * 1024 * 1024)) ;;
          G) FREE_MEM=$((FREE_MEM * 1024 * 1024 * 1024)) ;;
          T) FREE_MEM=$((FREE_MEM * 1024 * 1024 * 1024 * 1024)) ;;
        esac
      fi
      ;;
    *)
      # Try to use free if available
      if command -v free >/dev/null 2>&1; then
        # free prints a table; the figure wanted is the "free" column
        # (4th field) of the "Mem:" row.  LC_ALL=C keeps the row label
        # untranslated.
        RAW=$(LC_ALL=C free -b 2>/dev/null \
          | awk '$1 == "Mem:" { print $4; exit }') || RAW=""
        if [[ "$RAW" =~ ^[0-9]+$ ]]; then
          FREE_MEM=$RAW
        fi
      fi
      ;;
  esac

  echo "$FREE_MEM"
}
+
# Resolve a symbolic link to the path it ultimately points at.
#
# Globals:   OSTYPE (read) - decides whether GNU "readlink -f" is attempted
# Arguments: $1 - path to resolve
# Outputs:   the resolved path on stdout; $1 unchanged when it is not a
#            symbolic link
# Returns:   0
function resolveLink() {
  local NAME=$1
  local TARGET=""
  local HOPS=0

  if [ ! -L "$NAME" ]; then
    echo "$NAME"
    return 0
  fi

  case "$OSTYPE" in
    darwin*|*[Bb][Ss][Dd]*|dragonfly*)
      # BSD style readlink behaves differently to GNU readlink
      # Have to manually follow links (done below)
      ;;
    *)
      # Assuming standard GNU readlink with -f for canonicalize.
      # If that fails, fall through to following the links manually.
      if TARGET=$(readlink -f -- "$NAME" 2>/dev/null) && [ -n "$TARGET" ]; then
        printf '%s\n' "$TARGET"
        return 0
      fi
      ;;
  esac

  while [ -L "$NAME" ]; do
    # Guard against cyclic links so this can never loop forever
    HOPS=$((HOPS + 1))
    if [ "$HOPS" -gt 40 ]; then
      break
    fi

    TARGET=$(readlink -- "$NAME") || break
    case "$TARGET" in
      /*)
        NAME=$TARGET
        ;;
      *)
        # A relative target is relative to the directory holding the
        # link, not to the current working directory
        NAME="$(dirname -- "$NAME")/$TARGET"
        ;;
    esac
  done

  printf '%s\n' "$NAME"
}
+
# Resolve a path that may be a symbolic link, tolerating trailing slashes.
#
# Arguments: $1 - path to resolve
# Outputs:   the resolved path on stdout, produced by resolveLink when the
#            path (ignoring trailing slashes) is a symbolic link; otherwise
#            $1 exactly as given
# Returns:   0
function resolveLinks() {
  local NAME=$1
  local STRIPPED=$NAME

  # If the path ends in a / test -L will report false even
  # if the path is actually a symbolic link, so test the name with every
  # trailing / removed (a lone "/" is left alone)
  while [[ "$STRIPPED" == *"/" && "$STRIPPED" != "/" ]]; do
    STRIPPED=${STRIPPED%/}
  done

  if [ -L "$STRIPPED" ]; then
    NAME=$(resolveLink "$STRIPPED")
  fi

  printf '%s\n' "$NAME"
}
+
# Convert a path into an absolute path.
#
# Globals:   OSTYPE (read) - decides whether GNU "readlink -f" is attempted
# Arguments: $1 - path to convert; may be relative and may be a symbolic link
# Outputs:   the absolute path on stdout.  Directories are always printed
#            with a trailing slash.  If the path cannot be made absolute
#            (e.g. its parent directory does not exist) it is printed as it
#            stood after link resolution, so callers can still report it.
# Returns:   0
function makeAbsolute() {
  local NAME=$1
  local RESOLVED=""
  local PARENT=""
  local FILENAME=""

  # Nothing sensible can be done with an empty path
  if [ -z "$NAME" ]; then
    echo ""
    return 0
  fi

  # Follow links
  NAME=$(resolveLinks "$NAME")

  # Put back trailing slash
  # Do this before we make the path absolute or we'll absolutize wrong
  if [ -d "$NAME" ] && [[ "$NAME" != *"/" ]]; then
    NAME="${NAME}/"
  fi

  if [[ "$NAME" != "/"* ]]; then
    # Now make absolute
    case "$OSTYPE" in
      darwin*|*[Bb][Ss][Dd]*|dragonfly*)
        # BSD style readlink does not support the -f for canonicalization
        # so this is done via cd and pwd below
        ;;
      *)
        # Otherwise assume standard GNU readlink
        RESOLVED=$(readlink -f -- "$NAME" 2>/dev/null) || RESOLVED=""
        ;;
    esac

    if [ -z "$RESOLVED" ]; then
      # Portable fallback.  CDPATH is cleared so cd cannot jump elsewhere or
      # print the directory name; pwd -P yields a symlink-free path like
      # readlink -f does.
      if [ -d "$NAME" ]; then
        RESOLVED=$(CDPATH= cd -- "$NAME" 2>/dev/null && pwd -P) || RESOLVED=""
      else
        FILENAME=$(basename -- "$NAME")
        if PARENT=$(CDPATH= cd -- "$(dirname -- "$NAME")" 2>/dev/null && pwd -P); then
          # ${PARENT%/} avoids a double slash when the parent is /
          RESOLVED="${PARENT%/}/${FILENAME}"
        fi
      fi
    fi

    # Keep the unresolved name rather than blanking it when nothing worked
    if [ -n "$RESOLVED" ]; then
      NAME=$RESOLVED
    fi

    # Put back trailing slash
    if [ -d "$NAME" ] && [[ "$NAME" != *"/" ]]; then
      NAME="${NAME}/"
    fi
  fi

  printf '%s\n' "$NAME"
}
+
#DATE="+%Y-%m-%dT%H:%M:%S%:z"
DATE="+%H:%M:%S"
http://git-wip-us.apache.org/repos/asf/jena/blob/a7ac2797/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
index 6904c83..2c48a50 100755
--- a/apache-jena/bin/tdbloader2data
+++ b/apache-jena/bin/tdbloader2data
@@ -134,6 +134,13 @@ if [ $# = 0 ]; then
abort 1 "No data files specified"
fi
+# Make LOC absolute
+ABS_LOC=$(makeAbsolute "$LOC")
+if [ "$ABS_LOC" != "$LOC" ]; then
+ LOC="$ABS_LOC"
+ debug "Absolute database location is $LOC"
+fi
+
# Look for any index and data files in the directory.
# Skip a possible configuration file
if test -n "$(find "$LOC" -maxdepth 1 -type f ! -name 'this.*' -print -quit)"
@@ -154,8 +161,6 @@ if [ ! -d "$LOC" ]; then
abort 1 "Location is not a directory: $LOC"
fi
-# TODO Make LOC absolute
-
FILES="$@"
## JVM Arguments
http://git-wip-us.apache.org/repos/asf/jena/blob/a7ac2797/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index 5de8d6a..15a5832 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -127,6 +127,15 @@ done
if [ -z "$LOC" ]; then
abort 1 "No location specified"
fi
+
+# Make LOC absolute
+ABS_LOC=$(makeAbsolute "$LOC")
+if [ "$ABS_LOC" != "$LOC" ]; then
+ LOC="$ABS_LOC"
+ debug "Absolute database location is $LOC"
+fi
+
+# Check location
if [ ! -e "$LOC" ]; then
abort 1 "Location specified does not exist: $LOC"
fi
@@ -134,8 +143,6 @@ if [ ! -d "$LOC" ]; then
abort 1 "Location is not a directory: $LOC"
fi
-# TODO Make LOC absolute
-
DATA_TRIPLES="$LOC/data-triples.tmp"
DATA_QUADS="$LOC/data-quads.tmp"
@@ -146,7 +153,7 @@ if [ ! -e "$DATA_QUADS" ]; then
abort 1 "No quads text file found in location, please run the tdbloader2data script first"
fi
-debug "Data files are $DATA_TRIPLES and $DATA_QUADS"
+debug "Data text files are $DATA_TRIPLES and $DATA_QUADS"
##--parallel is not always available.
SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
@@ -160,7 +167,7 @@ fi
debug "Jena Classpath is $JENA_CP"
# ---- Index intermediates
-## All files are writtern S P O / G S P O columns per row but in different sort orders.
+## All files are written S P O / G S P O columns per row but in different sort orders.
info "Index Building Phase"
# Check whether Pipe Viewer is available
@@ -191,6 +198,7 @@ else
# Using the system temp directory
SORT_TEMP_DIR="$TMPDIR"
fi
+SORT_TEMP_DIR=$(makeAbsolute "$SORT_TEMP_DIR")
debug "Sort Temp Directory: $SORT_TEMP_DIR"
SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
debug "Sort Temp Directory ${DIR} is on disk ${SORT_DRIVE_INFO[0]} which has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes)"
@@ -221,13 +229,36 @@ generate_index()
debug "Size of data to be sorted is $SIZE bytes"
# Verify that we have enough space to sort the data
+
+ # Firstly check that the output disk has sufficient space
local WORK_DRIVE_INFO=($(getDriveInfo "${WORK}"))
if [ "${SIZE}" -ge "${WORK_DRIVE_INFO[3]}" ]; then
+ # If there is insufficient disk space then we can abort now
abort 1 "Insufficient free space on database drive ${WORK_DRIVE_INFO[0]}, there are ${WORK_DRIVE_INFO[3]} bytes free but ${SIZE} bytes are required"
else
debug "Sufficient free space on database drive ${WORK_DRIVE_INFO[0]} to attempt sorting data file ${DATA} (${SIZE} bytes required from ${WORK_DRIVE_INFO[3]} bytes free)"
fi
+ # Secondly check if there is enough space to sort in-memory or if sort may need to do an external sort
+ # We only issue warnings when the sort is likely to be external because there are various factors
+ # such as virtual memory and OS file caching that may complicate this
+ FREE_MEM=$(getFreeMem)
+ if [ "$FREE_MEM" -ge 0 ]; then
+ if [ "$SIZE" -ge "$FREE_MEM" ]; then
+ warn "Insufficient free memory to sort data in-memory, sort will need to perform an external sort using Temp Directory ${SORT_TEMP_DIR}"
+
+ # Check for disk space on temporary disk
+ SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
+ if [ "$SIZE" -ge "${SORT_DRIVE_INFO[3]}" ]; then
+ warn "There may be insufficient for sort to perform an external sort using Tempo Directory ${SORT_TEMP_DIR} (${SIZE} bytes required but only ${SORT_DRIVE_INFO[3]} bytes free)"
+ fi
+ else
+ debug "Should be sufficient free memory ($FREE_MEM bytes) for sort to be fully in-memory"
+ fi
+ else
+ warn "Unable to determine free memory on your OS, can't check whether sort will be in-memory or external sort using Temp Directory ${SORT_TEMP_DIR}"
+ fi
+
# Sort the input data
info "Sort $IDX"
debug "Sorting $DATA into work file $WORK"