You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by rv...@apache.org on 2015/07/07 14:53:06 UTC

[07/18] jena git commit: Further improvements to tdbloader2 scripts (JENA-977)

Further improvements to tdbloader2 scripts (JENA-977)

- When JENA_HOME is auto-detected it is now exported so that it is
  visible to the child scripts
- The database directory path is now always made absolute, with any
  symbolic links in the path resolved
- Additional checks in tdbloader2index warn when the sort is likely to be
  external and may run out of temporary disk space


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/a7ac2797
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/a7ac2797
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/a7ac2797

Branch: refs/heads/master
Commit: a7ac2797856bf60476204b8997b5a5bf4cfa15c5
Parents: c55c1f7
Author: Rob Vesse <rv...@apache.org>
Authored: Tue Jun 30 13:44:29 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Tue Jun 30 13:44:29 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2       |   5 +-
 apache-jena/bin/tdbloader2common | 106 ++++++++++++++++++++++++++++++++++
 apache-jena/bin/tdbloader2data   |   9 ++-
 apache-jena/bin/tdbloader2index  |  39 +++++++++++--
 4 files changed, 152 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/a7ac2797/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index b7a1af2..310ee66 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -78,10 +78,11 @@ EOF
 
 # If JENA_HOME is empty
 if [ -z "$JENA_HOME" ];	then
+  echo "JENA_HOME not set, attempting to locate JENA_HOME automatically"
   SCRIPT="$0"
   # Catch common issue: script has been symlinked
 	if [ -L "$SCRIPT" ]; then
-		SCRIPT="$(readlink "$0")"
+		SCRIPT="$(readlink -f "$0")"
 		# If link is relative
 		case "$SCRIPT" in
    			/*) ;; # fine
@@ -91,6 +92,8 @@ if [ -z "$JENA_HOME" ];	then
 
   # Work out root from script location
   JENA_HOME="$( cd "$( dirname "$SCRIPT" )/.." && pwd )"
+  export JENA_HOME
+  echo "Located JENA_HOME at ${JENA_HOME}"
 fi
 source "${JENA_HOME}/bin/tdbloader2common"
 

http://git-wip-us.apache.org/repos/asf/jena/blob/a7ac2797/apache-jena/bin/tdbloader2common
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2common b/apache-jena/bin/tdbloader2common
index beae115..2830545 100644
--- a/apache-jena/bin/tdbloader2common
+++ b/apache-jena/bin/tdbloader2common
@@ -79,6 +79,112 @@ function getDriveInfo() {
   echo ${INFO[@]}
 }
 
+function getFreeMem() {
+  set +e
+  local FREE_MEM=-1
+  case "$OSTYPE" in
+    darwin*)
+      # Have to get this from top
+      FREE_MEM=$(top -l 1 | grep PhysMem | awk '{print $6}')
+      FREE_MEM=${FREE_MEM%M}
+      FREE_MEM=$(($FREE_MEM * 1024 * 1024))
+      ;;
+    *)
+      # Try to use free if available
+      which free >/dev/null 2>&1
+      if [ $? -eq 0 ]; then
+        # Have free available
+        FREE_MEM=$(free -b)
+      fi
+      ;;
+  esac
+  set -e
+
+  echo "$FREE_MEM"
+}
+
+function resolveLink() {
+  local NAME=$1
+
+  if [ -L "$NAME" ]; then
+    case "$OSTYPE" in
+      darwin*|*BSB*|*BSD|BSD*)
+        # BSD style readlink behaves differently to GNU readlink
+        # Have to manually follow links
+        while [ -L "$NAME" ]; do
+          NAME=$(readlink "$NAME")
+        done
+        ;;
+      *)
+        # Assume standard GNU readlink, which supports -f for
+        # canonicalization
+        NAME=$(readlink -f "$NAME")
+        ;;
+    esac
+  fi
+
+  echo "$NAME"
+}
+
+function resolveLinks() {
+  local NAME=$1
+
+  if [ -L "$NAME" ]; then
+    NAME=$(resolveLink "$NAME")
+  elif [[ "$NAME" == *"/" ]]; then
+    # If the path ends in a / test -L will report false even
+    # if the path is actually a symbolic link
+    # So check if the name without the trailing / is a link and if
+    # so resolve it
+    if [ -L "${NAME%/}" ]; then
+      NAME=${NAME%/}
+      NAME=$(resolveLink "$NAME")
+    fi
+  fi
+  echo "$NAME"
+}
+
+function makeAbsolute() {
+  local NAME=$1
+
+  # Follow links
+  NAME=$(resolveLinks "$NAME")
+
+  # Put back trailing slash
+  # Do this before we make the path absolute or we'll absolutize wrong
+  if [ -d "$NAME" ]; then
+    if [[ "$NAME" != *"/" ]]; then
+      NAME="${NAME}/"
+    fi
+  fi
+
+  if [[ "$NAME" != "/"* ]]; then
+    # Now make absolute
+    case "$OSTYPE" in
+      darwin*|*BSB*|*BSD|BSD*)
+        # BSD style readlink does not support the -f option for canonicalization
+        # so have to do this via cd, pwd and basename
+        local FILENAME=$(basename "$NAME")
+        NAME=$(cd $(dirname "$NAME"); pwd)
+        NAME="$NAME/$FILENAME"
+        ;;
+      *)
+        # Otherwise assume standard GNU readlink
+        NAME=$(readlink -f "$NAME")
+        ;;
+    esac
+
+    # Put back trailing slash
+    if [ -d "$NAME" ]; then
+      if [[ "$NAME" != *"/" ]]; then
+        NAME="${NAME}/"
+      fi
+    fi
+  fi
+
+  echo "$NAME"
+}
+
 #DATE="+%Y-%m-%dT%H:%M:%S%:z"
 DATE="+%H:%M:%S"
 

http://git-wip-us.apache.org/repos/asf/jena/blob/a7ac2797/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
index 6904c83..2c48a50 100755
--- a/apache-jena/bin/tdbloader2data
+++ b/apache-jena/bin/tdbloader2data
@@ -134,6 +134,13 @@ if [ $# = 0 ]; then
   abort 1 "No data files specified"
 fi
 
+# Make LOC absolute
+ABS_LOC=$(makeAbsolute "$LOC")
+if [ "$ABS_LOC" != "$LOC" ]; then
+  LOC="$ABS_LOC"
+  debug "Absolute database location is $LOC"
+fi
+
 # Look for any index and data files in the directory.
 # Skip a possible configuration file
 if test -n "$(find "$LOC" -maxdepth 1 -type f ! -name 'this.*' -print -quit)"
@@ -154,8 +161,6 @@ if [ ! -d "$LOC" ]; then
   abort 1 "Location is not a directory: $LOC"
 fi
 
-# TODO Make LOC absolute
-
 FILES="$@"
 
 ## JVM Arguments

http://git-wip-us.apache.org/repos/asf/jena/blob/a7ac2797/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index 5de8d6a..15a5832 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -127,6 +127,15 @@ done
 if [ -z "$LOC" ]; then
   abort 1 "No location specified"
 fi
+
+# Make LOC absolute
+ABS_LOC=$(makeAbsolute "$LOC")
+if [ "$ABS_LOC" != "$LOC" ]; then
+  LOC="$ABS_LOC"
+  debug "Absolute database location is $LOC"
+fi
+
+# Check location
 if [ ! -e "$LOC" ]; then
   abort 1 "Location specified does not exist: $LOC"
 fi
@@ -134,8 +143,6 @@ if [ ! -d "$LOC" ]; then
   abort 1 "Location is not a directory: $LOC"
 fi
 
-# TODO Make LOC absolute
-
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
@@ -146,7 +153,7 @@ if [ ! -e "$DATA_QUADS" ]; then
   abort 1 "No quads text file found in location, please run the tdbloader2data script first"
 fi
 
-debug "Data files are $DATA_TRIPLES and $DATA_QUADS"
+debug "Data text files are $DATA_TRIPLES and $DATA_QUADS"
 
 ##--parallel is not always available.
 SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
@@ -160,7 +167,7 @@ fi
 debug "Jena Classpath is $JENA_CP"
 
 # ---- Index intermediates
-## All files are writtern S P O / G S P O columns per row but in different sort orders.
+## All files are written S P O / G S P O columns per row but in different sort orders.
 info "Index Building Phase"
 
 # Check whether Pipe Viewer is available
@@ -191,6 +198,7 @@ else
   # Using the system temp directory
   SORT_TEMP_DIR="$TMPDIR"
 fi
+SORT_TEMP_DIR=$(makeAbsolute "$SORT_TEMP_DIR")
 debug "Sort Temp Directory: $SORT_TEMP_DIR"
 SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
 debug "Sort Temp Directory ${DIR} is on disk ${SORT_DRIVE_INFO[0]} which has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes)"
@@ -221,13 +229,36 @@ generate_index()
     debug "Size of data to be sorted is $SIZE bytes"
 
     # Verify that we have enough space to sort the data
+
+    # Firstly check that the output disk has sufficient space
     local WORK_DRIVE_INFO=($(getDriveInfo "${WORK}"))
     if [ "${SIZE}" -ge "${WORK_DRIVE_INFO[3]}" ]; then
+      # If there is insufficient disk space then we can abort now
       abort 1 "Insufficient free space on database drive ${WORK_DRIVE_INFO[0]}, there are ${WORK_DRIVE_INFO[3]} bytes free but ${SIZE} bytes are required"
     else
       debug "Sufficient free space on database drive ${WORK_DRIVE_INFO[0]} to attempt sorting data file ${DATA} (${SIZE} bytes required from ${WORK_DRIVE_INFO[3]} bytes free)"
     fi
 
+    # Secondly check whether there is enough free memory to sort in-memory or whether sort may need to do an external sort
+    # We only issue warnings when the sort is likely to be external because there are various factors
+    # such as virtual memory and OS file caching that may complicate this estimate
+    FREE_MEM=$(getFreeMem)
+    if [ "$FREE_MEM" -ge 0 ]; then
+      if [ "$SIZE" -ge "$FREE_MEM" ]; then
+        warn "Insufficient free memory to sort data in-memory, sort will need to perform an external sort using Temp Directory ${SORT_TEMP_DIR}"
+
+        # Check for disk space on temporary disk
+        SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
+        if [ "$SIZE" -ge "${SORT_DRIVE_INFO[3]}" ]; then
+          warn "There may be insufficient disk space for sort to perform an external sort using Temp Directory ${SORT_TEMP_DIR} (${SIZE} bytes required but only ${SORT_DRIVE_INFO[3]} bytes free)"
+        fi
+      else
+        debug "Should be sufficient free memory ($FREE_MEM bytes) for sort to be fully in-memory"
+      fi
+    else
+      warn "Unable to determine free memory on your OS, can't check whether sort will be in-memory or external sort using Temp Directory ${SORT_TEMP_DIR}"
+    fi
+
     # Sort the input data
     info "Sort $IDX"
     debug "Sorting $DATA into work file $WORK"