You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by rv...@apache.org on 2015/06/26 17:32:18 UTC

[1/4] jena git commit: Initial work on refactoring tdbloader2 scripts (JENA-977)

Repository: jena
Updated Branches:
  refs/heads/JENA-977 [created] 7770596bc


Initial work on refactoring tdbloader2 scripts (JENA-977)

- Better option processing
- Split tdbloader2worker into separate data and index phase scripts
  (tdbloader2data and tdbloader2index)
- Support running only a specific phase (--phase all|data|index)


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/d92e3362
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/d92e3362
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/d92e3362

Branch: refs/heads/JENA-977
Commit: d92e336263da3f0f2a58dfc24cb9b5f23449cc5c
Parents: 13855a6
Author: Rob Vesse <rv...@apache.org>
Authored: Thu Jun 25 16:56:29 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Fri Jun 26 16:30:15 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2      |  72 +++++++++++++++-
 apache-jena/bin/tdbloader2data  | 107 ++++++++++++++++++++++++
 apache-jena/bin/tdbloader2index | 155 +++++++++++++++++++++++++++++++++++
 3 files changed, 333 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/d92e3362/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index c081074..37cc874 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -48,6 +48,7 @@ case "$(uname)" in
 esac
 
 export JENA_CP
+echo $JENA_CP
 if [ -z "$SORT_ARGS" ]
 then
     SORT_ARGS="--buffer-size=50%"
@@ -58,4 +59,73 @@ then
 fi
 export SORT_ARGS
 
-exec "$JENA_HOME/bin/tdbloader2worker" "$@"
+# Process arguments
+LOC=
+PHASE=
+
+while [ $# -gt 0 ]
+do
+  ARG=$1
+  case "$ARG" in
+    --loc|-loc)
+      # Location space separated
+      shift
+      LOC="$1"
+      shift
+      ;;
+    -*loc=*)
+      # Location = separated
+      LOC=${ARG/-*loc=/}
+      shift
+      ;;
+    --phase)
+      # Phase space separated
+      shift
+      PHASE="$1"
+      shift
+      ;;
+    *)
+      # Once we see an unrecognized argument treat as start of files to process
+      break
+      ;;
+  esac
+done
+
+if [ -z "$PHASE" ]; then
+  PHASE="all"
+fi
+
+echo "Location is '$LOC'"
+echo "Phase is '$PHASE'"
+
+log() { echo " $(date $DATE)" "$@" ; }
+
+#DATE="+%Y-%m-%dT%H:%M:%S%:z"
+DATE="+%H:%M:%S"
+
+# ---- Start
+log "-- TDB Bulk Loader Start"
+TIME1="$(date +%s)"
+
+case "$PHASE" in
+  all)
+    exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@"
+    exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC"
+    ;;
+  data)
+    exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@"
+    ;;
+  index)
+    exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC"
+    ;;
+  *)
+    echo "Unrecognized phase $PHASE" 1>&2
+    exit 1
+    ;;
+esac
+
+# ---- End
+TIME2="$(date +%s)"
+log "-- TDB Bulk Loader Finish"
+ELAPSED=$(($TIME2-$TIME1))
+log "-- $ELAPSED seconds"
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/jena/blob/d92e3362/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
new file mode 100755
index 0000000..90200e4
--- /dev/null
+++ b/apache-jena/bin/tdbloader2data
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+
+## Licensed to the Apache Software Foundation (ASF) under one
+## or more contributor license agreements.  See the NOTICE file
+## distributed with this work for additional information
+## regarding copyright ownership.  The ASF licenses this file
+## to you under the Apache License, Version 2.0 (the
+## "License"); you may not use this file except in compliance
+## with the License.  You may obtain a copy of the License at
+##
+##     http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+## See the License for the specific language governing permissions and
+## limitations under the License.
+
+# The environment for this sub-script is setup by "tdbloader2"
+
+# Exit on error.
+set -e
+
+# Sort order is ASCII
+export LC_ALL="C"
+
+log() { echo " $(date $DATE)" "$@" ; }
+
+#DATE="+%Y-%m-%dT%H:%M:%S%:z"
+DATE="+%H:%M:%S"
+
+## JVM Arguments
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+    echo "Classpath not provided : set JENA_CP" 1>&2
+    exit 1
+fi
+
+USAGE="Usage: tdbloader2data --loc location datafile ..."
+PKG=org.apache.jena.tdb.store.bulkloader2
+
+while [ $# -gt 0 ]
+do
+  ARG=$1
+  case "$ARG" in
+    --loc|-loc)
+      # Location space separated
+      shift
+      LOC="$1"
+      shift
+      ;;
+    -*loc=*)
+      # Location = separated
+      LOC=${ARG/-*loc=/}
+      shift
+      ;;
+    --help)
+      echo $USAGE
+      exit 0
+      ;;
+    *)
+      # Any further arguments are treated as data files
+      break
+      ;;
+  esac
+done
+
+# Verify arguments
+if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
+if [ $# = 0 ]; then echo "No data files specified" ; exit 1 ; fi
+
+# Look for any index and data files in the directory.
+# Skip a possible configuration file
+if test -n "$(find "$LOC" -maxdepth 1 -type f ! -name 'this.*' -print -quit)"
+then 
+    echo "Location is not empty: $LOC"
+    exit 1
+fi
+
+if [ ! -e "$LOC" ] ; then
+  # If non-existent try to create
+  mkdir "$LOC"
+  if [ $? != 0 ]; then
+    echo "Failed to create new directory: $LOC"
+    exit 1
+  fi
+fi
+if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
+
+FILES="$@"
+## Stdin?
+KEEPWORKFILES="${KEEPWORKFILES:-}"
+
+# ---- Data loading phase
+log "Data Load Phase"
+# Produce nodes file and triples/quads text file.
+
+DATA_TRIPLES="$LOC/data-triples.tmp"
+DATA_QUADS="$LOC/data-quads.tmp"
+
+java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdNodeTableBuilder \
+    "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" $FILES
+
+log "Data Load Phase Completed"

http://git-wip-us.apache.org/repos/asf/jena/blob/d92e3362/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
new file mode 100755
index 0000000..372aa5c
--- /dev/null
+++ b/apache-jena/bin/tdbloader2index
@@ -0,0 +1,155 @@
+#!/usr/bin/env bash
+
+## Licensed to the Apache Software Foundation (ASF) under one
+## or more contributor license agreements.  See the NOTICE file
+## distributed with this work for additional information
+## regarding copyright ownership.  The ASF licenses this file
+## to you under the Apache License, Version 2.0 (the
+## "License"); you may not use this file except in compliance
+## with the License.  You may obtain a copy of the License at
+##
+##     http://www.apache.org/licenses/LICENSE-2.0
+##
+## Unless required by applicable law or agreed to in writing, software
+## distributed under the License is distributed on an "AS IS" BASIS,
+## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+## See the License for the specific language governing permissions and
+## limitations under the License.
+
+# The environment for this sub-script is setup by "tdbloader2"
+
+# Exit on error.
+set -e
+
+# Sort order is ASCII
+export LC_ALL="C"
+
+log() { echo " $(date $DATE)" "$@" ; }
+
+TMP=$$
+#DATE="+%Y-%m-%dT%H:%M:%S%:z"
+DATE="+%H:%M:%S"
+
+##--parallel is not always available.
+SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+    echo "Classpath not provided : set JENA_CP" 1>&2
+    exit 1
+fi
+
+USAGE="Usage: tdbloader2index --loc location"
+PKG=org.apache.jena.tdb.store.bulkloader2
+
+while [ $# -gt 0 ]
+do
+  ARG=$1
+  case "$ARG" in
+    --loc|-loc)
+      # Location space separated
+      shift
+      LOC="$1"
+      shift
+      ;;
+    -*loc=*)
+      # Location = separated
+      LOC=${ARG/-*loc=/}
+      shift
+      ;;
+    --help)
+      echo $USAGE
+      exit 0
+      ;;
+    *)
+      # Any further arguments are ignored
+      break
+      ;;
+  esac
+done
+
+# Verify arguments
+if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
+if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; exit 1; fi
+if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
+
+KEEPWORKFILES="${KEEPWORKFILES:-}"
+
+DATA_TRIPLES="$LOC/data-triples.tmp"
+DATA_QUADS="$LOC/data-quads.tmp"
+
+# ---- Index intermediates
+## All files are writtern S P O / G S P O columns per row but in different sort orders.
+log "Index Building Phase"
+
+which pv >/dev/null 2>&1
+HAS_PV=$?
+
+process_rows()
+{
+    local KEYS="$1"
+    local DATA="$2"
+    local IDX=$3
+    local WORK="$LOC/$IDX-txt"
+
+    if [ ! -s "$DATA" ]
+    then
+	    return
+	  fi
+
+    log "Creating Index $IDX"
+    log "  Sort $IDX"
+    if [ $HAS_PV = 0 ]; then
+      # Use pv (pipe viewer) to monitor sort progress
+      # Note that progress data will only be seen if running in the foreground
+      SIZE=$(du -k "$DATA" | cut -f 1)
+      pv -c -N data < "$DATA" | sort $SORT_ARGS -u $KEYS | pv -c -N sort -s $SIZE > $WORK
+    else
+      # Use sort without any progress monitoring
+      sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK
+    fi
+    log "  Sort $IDX Completed"
+    log "  Build $IDX"
+    rm -f "$LOC/$IDX.dat"
+    rm -f "$LOC/$IDX.idn"
+    java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
+    log "  Build $IDX Completed"
+    # Remove intermediary file.
+    if [ "$KEEPWORKFILES" != "yes" ] 
+    then
+	    rm "$WORK"
+    fi
+}
+
+K1="-k 1,1"
+K2="-k 2,2"
+K3="-k 3,3"
+K4="-k 4,4"
+
+process_rows "$K1 $K2 $K3" "$DATA_TRIPLES" SPO
+
+process_rows "$K2 $K3 $K1" "$DATA_TRIPLES" POS
+
+process_rows "$K3 $K1 $K2" "$DATA_TRIPLES" OSP
+
+process_rows "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO
+
+process_rows "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS
+
+process_rows "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP
+
+process_rows "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG
+
+process_rows "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
+
+process_rows "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
+
+log "Index Building Phase Completed"
+
+# ---- Clean up.
+if [ "$KEEPWORKFILES" != "yes" ] 
+then
+    rm -f "$DATA_TRIPLES" "$DATA_QUADS" 
+fi


[3/4] jena git commit: Further refactoring of tdbloader2 scripts (JENA-977)

Posted by rv...@apache.org.
Further refactoring of tdbloader2 scripts (JENA-977)

- Proper usage summaries in all scripts
- -k/--keep-work option instead of the hidden KEEPWORKFILES environment
  variable for keeping work files
- Short forms for all options


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/a96b0164
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/a96b0164
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/a96b0164

Branch: refs/heads/JENA-977
Commit: a96b0164c43142791ac030e5332b3f54df6fb4ba
Parents: 7b61a14
Author: Rob Vesse <rv...@apache.org>
Authored: Fri Jun 26 12:25:57 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Fri Jun 26 16:30:53 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2      |  72 ++++++++++++++++------
 apache-jena/bin/tdbloader2data  |  82 ++++++++++++++++++++-----
 apache-jena/bin/tdbloader2index | 116 +++++++++++++++++++++++++----------
 3 files changed, 204 insertions(+), 66 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index 34ee029..9ff2727 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -17,24 +17,53 @@
 
 function printUsage() {
   cat << EOF
-Usage: tdbloader2 <Options> <Data>
+tdbloader2 - TDB Bulk Loader
 
-Options are as follows:
+Usage: tdbloader2 --loc <Directory> [Options] <Data> ...
 
+Bulk loader for TDB which manipulates the data files directly and so
+can only be used to create new databases.  This command relies on
+POSIX utilities so will only work on POSIX operating systems.
+
+If you wish to bulk load to an existing database please use tdbloader
+instead.
+
+Required options are as follows:
+
+  -l <DatabaseDirectory>
+  --loc <DatabaseDirectory>
+    Sets the location in which the database should be created.
+
+    This location must be a directory and must be empty, if a
+    non-existent path is specified it will be created as a new
+    directory.
+
+  <Data>
+    Specifies the path to one/more data files to load
+
+Common additional options are as follows:
+
+  -h
   --help
     Prints this help summary and exits
 
-  --loc <DatabaseDirectory>
-    Sets the location in which the database should be created
+Advanced additional options are as follows:
 
+  -k
+  --keep-work
+    Keeps the temporary work files around after they are no longer
+    needed.  May be useful for debugging.
+
+  -p <Phase>
   --phase <Phase>
     Sets the phase of the build to run, supported values are:
 
-      all    Full bulk load
-      data   Data phase only
-      index  Index phase only, requires the data phase to previously have been run
+      all      Full bulk load
+      data     Data phase only
+      index    Index phase only, requires the data phase to
+               previously have been run
 
-    When not specified defaults to all
+    When no phase is specified it defaults to all
 
 EOF
 }
@@ -86,12 +115,18 @@ export SORT_ARGS
 # Process arguments
 LOC=
 PHASE=
+KEEP_WORK=0
 
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
-    --loc|-loc)
+    -k|--keep-work)
+      # Keep work files
+      shift
+      KEEP_WORK=1
+      ;;
+    -l|--loc|-loc)
       # Location space separated
       shift
       LOC="$1"
@@ -102,13 +137,13 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
-    --phase)
+    -p|--phase)
       # Phase space separated
       shift
       PHASE="$1"
       shift
       ;;
-    --help)
+    -h|--help)
       # Help
       printUsage
       exit 0
@@ -123,9 +158,10 @@ done
 if [ -z "$PHASE" ]; then
   PHASE="all"
 fi
-
-#echo "Location is '$LOC'"
-#echo "Phase is '$PHASE'"
+COMMON_ARGS=
+if [ $KEEP_WORK = 0 ]; then
+  COMMON_ARGS="--keep-work"
+fi
 
 log() { echo " $(date $DATE)" "$@" ; }
 
@@ -138,14 +174,14 @@ TIME1="$(date +%s)"
 
 case "$PHASE" in
   all)
-    exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@"
-    exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC"
+    exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
+    exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC"
     ;;
   data)
-    exec "$JENA_HOME/bin/tdbloader2data" --loc "$LOC" "$@"
+    exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
     ;;
   index)
-    exec "$JENA_HOME/bin/tdbloader2index" --loc "$LOC"
+    exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC"
     ;;
   *)
     echo "Unrecognized phase $PHASE" 1>&2

http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
index 90200e4..5aceb27 100755
--- a/apache-jena/bin/tdbloader2data
+++ b/apache-jena/bin/tdbloader2data
@@ -18,6 +18,48 @@
 
 # The environment for this sub-script is setup by "tdbloader2"
 
+function printUsage() {
+  cat << EOF
+tdbloader2data - TDB Bulk Loader - Data Phase
+
+Usage tdbloader2data --loc <Directory> [Options] <Data> ...
+
+Bulk Loader for TDB which generates the Node Table.  This command
+relies on POSIX utilities so will only work on POSIX operating
+systems.
+
+This command can only be used to create new database. If you wish to
+bulk load to an existing database please use tdbloader instead.
+
+Required options are as follows:
+
+  -l <DatabaseDirectory>
+  --loc <DatabaseDirectory>
+    Sets the location in which the database should be created.
+
+    This location must be a directory and must be empty, if a
+    non-existent path is specified it will be created as a new
+    directory.
+
+  <Data>
+    Specifies the path to one/more data files to load
+
+Common additional options are as follows:
+
+  -h
+  --help
+    Prints this help summary and exits
+
+Advanced additional options are as follows:
+
+  -k
+  --keep-work
+    Keeps the temporary work files around after they are no longer
+    needed.  May be useful for debugging.
+
+EOF
+}
+
 # Exit on error.
 set -e
 
@@ -29,24 +71,24 @@ log() { echo " $(date $DATE)" "$@" ; }
 #DATE="+%Y-%m-%dT%H:%M:%S%:z"
 DATE="+%H:%M:%S"
 
-## JVM Arguments
-JVM_ARGS=${JVM_ARGS:--Xmx1200M}
-
-# Classpath set in "tdbloader2"
-if [ -z "$JENA_CP" ]
-then
-    echo "Classpath not provided : set JENA_CP" 1>&2
-    exit 1
-fi
-
-USAGE="Usage: tdbloader2data --loc location datafile ..."
 PKG=org.apache.jena.tdb.store.bulkloader2
 
+# Process Arguments
+LOC=
+KEEP_WORK=0
+
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
-    --loc|-loc)
+    -k|--keep-work)
+      # Keep work files
+      # This option is actually not used by this script but may be passed in
+      # by the parent tdbloader2 script
+      shift
+      KEEP_WORK=1
+      ;;
+    -l|--loc|-loc)
       # Location space separated
       shift
       LOC="$1"
@@ -57,8 +99,8 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
-    --help)
-      echo $USAGE
+    -h|--help)
+      printUsage
       exit 0
       ;;
     *)
@@ -91,8 +133,16 @@ fi
 if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
 
 FILES="$@"
-## Stdin?
-KEEPWORKFILES="${KEEPWORKFILES:-}"
+
+## JVM Arguments
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+    echo "Classpath not provided : set JENA_CP" 1>&2
+    exit 1
+fi
 
 # ---- Data loading phase
 log "Data Load Phase"

http://git-wip-us.apache.org/repos/asf/jena/blob/a96b0164/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index 5624854..2730af1 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -18,6 +18,45 @@
 
 # The environment for this sub-script is setup by "tdbloader2"
 
+function printUsage() {
+  cat << EOF
+tdbloader2index - TDB Bulk Loader - Index Phase
+
+Usage: tdbloader2index --loc <Directory> [Options]
+
+Bulk Loader for TDB which generates the Index files based upon the
+temporary data files generated by tdbloader2data.  This command relies
+on POSIX utilities so will only work on POSIX operating systems.
+
+This command can only be used to create new database. If you wish to
+bulk load to an existing database please use tdbloader instead.
+
+Required options are as follows:
+
+  -l <DatabaseDirectory>
+  --loc <DatabaseDirectory>
+    Sets the location in which the database should be created.
+
+    This location must be a directory and must be empty, if a
+    non-existent path is specified it will be created as a new
+    directory.
+
+Common additional options are as follows:
+
+  -h
+  --help
+    Prints this help summary and exits
+
+Advanced additional options are as follows:
+
+  -k
+  --keep-work
+    Keeps the temporary work files around after they are no longer
+    needed.  May be useful for debugging.
+
+EOF
+}
+
 # Exit on error.
 set -e
 
@@ -30,25 +69,22 @@ TMP=$$
 #DATE="+%Y-%m-%dT%H:%M:%S%:z"
 DATE="+%H:%M:%S"
 
-##--parallel is not always available.
-SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
-JVM_ARGS=${JVM_ARGS:--Xmx1200M}
-
-# Classpath set in "tdbloader2"
-if [ -z "$JENA_CP" ]
-then
-    echo "Classpath not provided : set JENA_CP" 1>&2
-    exit 1
-fi
-
-USAGE="Usage: tdbloader2index --loc location"
 PKG=org.apache.jena.tdb.store.bulkloader2
 
+# Process Arguments
+LOC=
+KEEP_WORK=0
+
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
-    --loc|-loc)
+    -k|--keep-work)
+      # Keep work files
+      shift
+      KEEP_WORK=1
+      ;;
+    -l|--loc|-loc)
       # Location space separated
       shift
       LOC="$1"
@@ -59,8 +95,8 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
-    --help)
-      echo $USAGE
+    -h|--help)
+      printUsage
       exit 0
       ;;
     *)
@@ -75,8 +111,6 @@ if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
 if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; exit 1; fi
 if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
 
-KEEPWORKFILES="${KEEPWORKFILES:-}"
-
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
@@ -89,14 +123,29 @@ if [ ! -e "$DATA_QUADS" ]; then
   exit 1
 fi
 
+##--parallel is not always available.
+SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+    echo "Classpath not provided : set JENA_CP" 1>&2
+    exit 1
+fi
+
 # ---- Index intermediates
 ## All files are writtern S P O / G S P O columns per row but in different sort orders.
 log "Index Building Phase"
 
+# Check whether Pipe Viewer is available
+# Needs to temporarily disable exit on error
+set +e
 which pv >/dev/null 2>&1
 HAS_PV=$?
+set -e
 
-process_rows()
+generate_index()
 {
     local KEYS="$1"
     local DATA="$2"
@@ -109,6 +158,8 @@ process_rows()
 	  fi
 
     log "Creating Index $IDX"
+
+    # Sort the input data
     log "  Sort $IDX"
     if [ $HAS_PV = 0 ]; then
       # Use pv (pipe viewer) to monitor sort progress
@@ -120,14 +171,16 @@ process_rows()
       sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK
     fi
     log "  Sort $IDX Completed"
+
+    # Build into an index
     log "  Build $IDX"
     rm -f "$LOC/$IDX.dat"
     rm -f "$LOC/$IDX.idn"
     java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
     log "  Build $IDX Completed"
-    # Remove intermediary file.
-    if [ "$KEEPWORKFILES" != "yes" ] 
-    then
+
+    # Remove work file unless keeping
+    if [ $KEEP_WORK = 1 ]; then
 	    rm "$WORK"
     fi
 }
@@ -137,28 +190,27 @@ K2="-k 2,2"
 K3="-k 3,3"
 K4="-k 4,4"
 
-process_rows "$K1 $K2 $K3" "$DATA_TRIPLES" SPO
+generate_index "$K1 $K2 $K3" "$DATA_TRIPLES" SPO
 
-process_rows "$K2 $K3 $K1" "$DATA_TRIPLES" POS
+generate_index "$K2 $K3 $K1" "$DATA_TRIPLES" POS
 
-process_rows "$K3 $K1 $K2" "$DATA_TRIPLES" OSP
+generate_index "$K3 $K1 $K2" "$DATA_TRIPLES" OSP
 
-process_rows "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO
+generate_index "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO
 
-process_rows "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS
+generate_index "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS
 
-process_rows "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP
+generate_index "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP
 
-process_rows "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG
+generate_index "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG
 
-process_rows "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
+generate_index "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
 
-process_rows "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
+generate_index "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
 
 log "Index Building Phase Completed"
 
 # ---- Clean up.
-if [ "$KEEPWORKFILES" != "yes" ] 
-then
+if [ $KEEP_WORK = 1 ]; then
     rm -f "$DATA_TRIPLES" "$DATA_QUADS" 
 fi


[4/4] jena git commit: Various further improvements to the scripts (JENA-977)

Posted by rv...@apache.org.
Various further improvements to the scripts (JENA-977)

- Validate sort temporary directory when indexing and WARN if the disk
  it is on is low on space (10% or less free)
- Support --debug and --trace flags in all scripts, add various debug
  output throughout scripts
- Fix a bug where a sort failure was not detected when pv is used to
  monitor progress
- Fix a bug in the size calculations used for progress monitoring and
  sort failure detection

This commit includes some temporary changes, marked with DEV comments in
the scripts (the TOOL_DIR and PKG overrides), that will be reverted later.


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/7770596b
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/7770596b
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/7770596b

Branch: refs/heads/JENA-977
Commit: 7770596bc94613409fe2753240b603ae22a38b57
Parents: a96b016
Author: Rob Vesse <rv...@apache.org>
Authored: Fri Jun 26 16:15:18 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Fri Jun 26 16:31:05 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2      |  59 +++++++++++-----
 apache-jena/bin/tdbloader2data  |  43 ++++++++++--
 apache-jena/bin/tdbloader2index | 126 ++++++++++++++++++++++++++++++-----
 3 files changed, 192 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index 9ff2727..9508031 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -49,6 +49,10 @@ Common additional options are as follows:
 
 Advanced additional options are as follows:
 
+  -d
+  --debug
+    Enable debug mode, adds extra debug output
+
   -k
   --keep-work
     Keeps the temporary work files around after they are no longer
@@ -65,6 +69,10 @@ Advanced additional options are as follows:
 
     When no phase is specified it defaults to all
 
+  -t
+  --trace
+    Enable trace mode, essentially sets -x within the scripts
+
 EOF
 }
 
@@ -101,13 +109,12 @@ case "$(uname)" in
 esac
 
 export JENA_CP
-#echo $JENA_CP
-if [ -z "$SORT_ARGS" ]
-then
+# echo JENA_CP
+if [ -z "$SORT_ARGS" ]; then
     SORT_ARGS="--buffer-size=50%"
-    if $(sort --parallel=3 < /dev/null 2>/dev/null) 
-    then
-	SORT_ARGS="$SORT_ARGS --parallel=3"
+    sort --parallel=3 < /dev/null 2>/dev/null
+    if [ $? = 0 ]; then
+    	SORT_ARGS="$SORT_ARGS --parallel=3"
     fi
 fi
 export SORT_ARGS
@@ -116,11 +123,23 @@ export SORT_ARGS
 LOC=
 PHASE=
 KEEP_WORK=0
+DEBUG=0
+TRACE=0
 
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
+    -d|--debug)
+      # Debug Mode
+      shift
+      DEBUG=1
+      ;;
+    -h|--help)
+      # Help
+      printUsage
+      exit 0
+      ;;
     -k|--keep-work)
       # Keep work files
       shift
@@ -143,10 +162,11 @@ do
       PHASE="$1"
       shift
       ;;
-    -h|--help)
-      # Help
-      printUsage
-      exit 0
+    -t|--trace)
+      # Trace mode
+      shift
+      TRACE=1
+      set -x
       ;;
     *)
       # Once we see an unrecognized argument treat as start of files to process
@@ -159,9 +179,15 @@ if [ -z "$PHASE" ]; then
   PHASE="all"
 fi
 COMMON_ARGS=
-if [ $KEEP_WORK = 0 ]; then
+if [ $KEEP_WORK = 1 ]; then
   COMMON_ARGS="--keep-work"
 fi
+if [ $DEBUG = 1 ]; then
+  COMMON_ARGS="$COMMON_ARGS --debug"
+fi
+if [ $TRACE = 1 ]; then
+  COMMON_ARGS="$COMMON_ARGS --trace"
+fi
 
 log() { echo " $(date $DATE)" "$@" ; }
 
@@ -172,16 +198,19 @@ DATE="+%H:%M:%S"
 log "-- TDB Bulk Loader Start"
 TIME1="$(date +%s)"
 
+TOOL_DIR=$JENA_HOME/bin/
+# DEV - Following is just for debugging
+TOOL_DIR=
 case "$PHASE" in
   all)
-    exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
-    exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC"
+    exec "${TOOL_DIR}tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
+    exec "${TOOL_DIR}tdbloader2index" $COMMON_ARGS --loc "$LOC"
     ;;
   data)
-    exec "$JENA_HOME/bin/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
+    exec "${TOOL_DIR}tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
     ;;
   index)
-    exec "$JENA_HOME/bin/tdbloader2index" $COMMON_ARGS --loc "$LOC"
+    exec "${TOOL_DIR}tdbloader2index" $COMMON_ARGS --loc "$LOC"
     ;;
   *)
     echo "Unrecognized phase $PHASE" 1>&2

http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
index 5aceb27..efb590a 100755
--- a/apache-jena/bin/tdbloader2data
+++ b/apache-jena/bin/tdbloader2data
@@ -52,35 +52,58 @@ Common additional options are as follows:
 
 Advanced additional options are as follows:
 
+  -d
+  --debug
+    Enable debug mode, adds extra debug output
+
   -k
   --keep-work
     Keeps the temporary work files around after they are no longer
     needed.  May be useful for debugging.
 
+  -t
+  --trace
+    Enable trace mode, essentially sets -x within the scripts
+
 EOF
 }
 
 # Exit on error.
 set -e
 
-# Sort order is ASCII
-export LC_ALL="C"
-
 log() { echo " $(date $DATE)" "$@" ; }
 
+function debug() {
+ if [ $DEBUG = 1 ]; then
+   log "DEBUG" "$@"
+ fi
+}
+
 #DATE="+%Y-%m-%dT%H:%M:%S%:z"
 DATE="+%H:%M:%S"
 
 PKG=org.apache.jena.tdb.store.bulkloader2
+#DEV - Allows use against Jena 2 API
+PKG=com.hp.hpl.jena.tdb.store.bulkloader2
 
 # Process Arguments
 LOC=
 KEEP_WORK=0
+DEBUG=0
 
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
+    -d|--debug)
+      # Debug Mode
+      shift
+      DEBUG=1
+      ;;
+    -h|--help)
+      printUsage
+      exit 0
+      ;;
     -k|--keep-work)
       # Keep work files
       # This option is actually not used by this script but may be passed in
@@ -99,9 +122,10 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
-    -h|--help)
-      printUsage
-      exit 0
+    -t|--trace)
+      # Trace mode
+      shift
+      set -x
       ;;
     *)
       # Any further arguments are treated as data files
@@ -124,18 +148,23 @@ fi
 
 if [ ! -e "$LOC" ] ; then
   # If non-existent try to create
+  debug "Trying to create new database directory: $LOC"
   mkdir "$LOC"
   if [ $? != 0 ]; then
     echo "Failed to create new directory: $LOC"
     exit 1
   fi
+  debug "New database directory created: $LOC"
 fi
 if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
 
+# TODO Make LOC absolute
+
 FILES="$@"
 
 ## JVM Arguments
 JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+debug "JVM Arguments are $JVM_ARGS"
 
 # Classpath set in "tdbloader2"
 if [ -z "$JENA_CP" ]
@@ -151,6 +180,8 @@ log "Data Load Phase"
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
+debug "Data files are $DATA_TRIPLES and $DATA_QUADS"
+
 java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdNodeTableBuilder \
     "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" $FILES
 

http://git-wip-us.apache.org/repos/asf/jena/blob/7770596b/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index 2730af1..971b824 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -49,11 +49,18 @@ Common additional options are as follows:
 
 Advanced additional options are as follows:
 
+  -d
+  --debug
+    Enable debug mode, adds extra debug output
+
   -k
   --keep-work
     Keeps the temporary work files around after they are no longer
     needed.  May be useful for debugging.
 
+  -t
+  --trace
+    Enable trace mode, essentially sets -x within the scripts
 EOF
 }
 
@@ -65,20 +72,45 @@ export LC_ALL="C"
 
 log() { echo " $(date $DATE)" "$@" ; }
 
-TMP=$$
+function debug() {
+ if [ $DEBUG = 1 ]; then
+   log "DEBUG" "$@"
+ fi
+}
+
+function warn() {
+  log "WARN" "$@"
+}
+
+function getSize() {
+  ls -l $1 | awk '{print $5}'
+}
+
 #DATE="+%Y-%m-%dT%H:%M:%S%:z"
 DATE="+%H:%M:%S"
 
 PKG=org.apache.jena.tdb.store.bulkloader2
+#DEV - Allows use against Jena 2 API
+PKG=com.hp.hpl.jena.tdb.store.bulkloader2
 
 # Process Arguments
 LOC=
 KEEP_WORK=0
+DEBUG=0
 
 while [ $# -gt 0 ]
 do
   ARG=$1
   case "$ARG" in
+    -d|--debug)
+      # Debug Mode
+      shift
+      DEBUG=1
+      ;;
+    -h|--help)
+      printUsage
+      exit 0
+      ;;
     -k|--keep-work)
       # Keep work files
       shift
@@ -95,9 +127,10 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
-    -h|--help)
-      printUsage
-      exit 0
+    -t|--trace)
+      # Trace mode
+      shift
+      set -x
       ;;
     *)
       # Any further arguments are ignored
@@ -111,6 +144,8 @@ if [ -z "$LOC" ] ; then echo "No location specified" ; exit 1 ; fi
 if [ ! -e "$LOC" ] ; then echo "Location specified does not exist: $LOC" ; exit 1; fi
 if [ ! -d "$LOC" ] ; then echo "Location is not a directory: $LOC" ; exit 1 ; fi
 
+# TODO Make LOC absolute
+
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
@@ -123,9 +158,12 @@ if [ ! -e "$DATA_QUADS" ]; then
   exit 1
 fi
 
+debug "Data files are $DATA_TRIPLES and $DATA_QUADS"
+
 ##--parallel is not always available.
 SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
 JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+debug "JVM Arguments are $JVM_ARGS"
 
 # Classpath set in "tdbloader2"
 if [ -z "$JENA_CP" ]
@@ -133,17 +171,57 @@ then
     echo "Classpath not provided : set JENA_CP" 1>&2
     exit 1
 fi
+debug "Jena Classpath is $JENA_CP"
 
 # ---- Index intermediates
 ## All files are writtern S P O / G S P O columns per row but in different sort orders.
 log "Index Building Phase"
 
 # Check whether Pipe Viewer is available
-# Needs to temporarily disable exit on error
+# Needs to temporarily disable exit on error, as 'which' exits with a non-zero
+# status if the given command is not found
 set +e
 which pv >/dev/null 2>&1
 HAS_PV=$?
 set -e
+if [ $HAS_PV = 0 ]; then
+  debug "pv (Pipe Viewer) available on your system so sorts will show progres"
+else
+  debug "No pv (Pipe Viewer) on your system so sorts will show no progress"
+fi
+
+# Check where we are storing temporary sort files
+debug "Sort Arguments: $SORT_ARGS"
+SORT_TEMP_DIR=
+if [[ "$SORT_ARGS" == *"-T "* ]]; then
+  # Specified via -T argument
+  SORT_TEMP_DIR=(${SORT_ARGS/-T /})
+  SORT_TEMP_DIR=${SORT_TEMP_DIR[0]}
+elif [[ "$SORT_ARGS" == *"--temporary-directory="* ]]; then
+  # Specified via --temporary-directory argument
+  SORT_TEMP_DIR=(${SORT_ARGS/--temporary-directory=/})
+  SORT_TEMP_DIR=${SORT_TEMP_DIR[0]}
+else
+  # Using the system temp directory
+  SORT_TEMP_DIR="$TMPDIR"
+fi
+debug "Sort Temp Directory: $SORT_TEMP_DIR"
+
+# Find out how much space is on the sort directory
+SORT_DRIVE_INFO=$(df "$SORT_TEMP_DIR" | tail -n +2)
+SORT_DRIVE_DISK=$(echo $SORT_DRIVE_INFO | awk '{print $1}')
+SORT_DRIVE_FREE_SPACE=$(echo $SORT_DRIVE_INFO | awk '{print $4}')
+SORT_DRIVE_USED=$(echo $SORT_DRIVE_INFO | awk '{print $5}')
+SORT_DRIVE_FREE=${SORT_DRIVE_USED/"%"/}
+SORT_DRIVE_FREE=$((100 - $SORT_DRIVE_FREE))
+debug "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_DISK} which has ${SORT_DRIVE_FREE}% free space (${SORT_DRIVE_FREE_SPACE} bytes)"
+
+if [ $SORT_DRIVE_FREE -le 10 ]; then
+  echo
+  warn "Sort Temp Directory ${SORT_TEMP_DIR} is on disk ${SORT_DRIVE_DISK} which only has ${SORT_DRIVE_FREE}% free space (${SORT_DRIVE_FREE_SPACE} bytes) available"
+  warn "This may result in sort failures if the data to be indexed is large"
+  echo
+fi
 
 generate_index()
 {
@@ -152,35 +230,52 @@ generate_index()
     local IDX=$3
     local WORK="$LOC/$IDX-txt"
 
-    if [ ! -s "$DATA" ]
-    then
+    if [ ! -s "$DATA" ]; then
+      debug "Skipping Index $IDX as no relevant data to index"
 	    return
 	  fi
 
     log "Creating Index $IDX"
 
     # Sort the input data
-    log "  Sort $IDX"
+    log "Sort $IDX"
+    debug "Sorting $DATA into work file $WORK"
     if [ $HAS_PV = 0 ]; then
       # Use pv (pipe viewer) to monitor sort progress
       # Note that progress data will only be seen if running in the foreground
-      SIZE=$(du -k "$DATA" | cut -f 1)
+      # To report progress need to know size of input data
+      SIZE=$(getSize "$DATA")
+      debug "Size of data to be sorted is $SIZE bytes"
+
       pv -c -N data < "$DATA" | sort $SORT_ARGS -u $KEYS | pv -c -N sort -s $SIZE > $WORK
+
+      # CAUTION
+      # If sort errors here then the piping through pv will stop us from seeing the error
+      # and we'll continue onwards
+      # Therefore we need to check that the output size is the same as the input size, as this is
+      # the only way to tell if sort succeeded
+      OUTPUT_SIZE=$(getSize "$WORK")
+      debug "Size of sorted data is $OUTPUT_SIZE bytes"
+      if [ $SIZE != $OUTPUT_SIZE ]; then
+        log "Aborting due to sort error"
+        exit 1
+      fi
     else
       # Use sort without any progress monitoring
       sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK
     fi
-    log "  Sort $IDX Completed"
+    log "Sort $IDX Completed"
 
     # Build into an index
-    log "  Build $IDX"
+    log "Build $IDX"
     rm -f "$LOC/$IDX.dat"
     rm -f "$LOC/$IDX.idn"
     java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
-    log "  Build $IDX Completed"
+    log "Build $IDX Completed"
 
     # Remove work file unless keeping
-    if [ $KEEP_WORK = 1 ]; then
+    if [ $KEEP_WORK = 0 ]; then
+      debug "Cleaning up work file $WORK"
 	    rm "$WORK"
     fi
 }
@@ -211,6 +306,7 @@ generate_index "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
 log "Index Building Phase Completed"
 
 # ---- Clean up.
-if [ $KEEP_WORK = 1 ]; then
-    rm -f "$DATA_TRIPLES" "$DATA_QUADS" 
+if [ $KEEP_WORK = 0 ]; then
+  debug "Cleaning up data files $DATA_TRIPLES and $DATA_QUADS"
+  rm -f "$DATA_TRIPLES" "$DATA_QUADS"
 fi


[2/4] jena git commit: Further tweak new tdbloader2 scripts (JENA-977)

Posted by rv...@apache.org.
Further tweak new tdbloader2 scripts (JENA-977)

- Add proper usage to tdbloader2
- Check for temporary data files needed for index phase in
  tdbloader2index


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/7b61a144
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/7b61a144
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/7b61a144

Branch: refs/heads/JENA-977
Commit: 7b61a144854d81acbd180b5debfd5c8638d2af57
Parents: d92e336
Author: Rob Vesse <rv...@apache.org>
Authored: Thu Jun 25 17:04:36 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Fri Jun 26 16:30:45 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2      | 35 ++++++++++++++++++++++++++++++++---
 apache-jena/bin/tdbloader2index |  9 +++++++++
 2 files changed, 41 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/7b61a144/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index 37cc874..34ee029 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -15,6 +15,30 @@
 ## See the License for the specific language governing permissions and
 ## limitations under the License.
 
+function printUsage() {
+  cat << EOF
+Usage: tdbloader2 <Options> <Data>
+
+Options are as follows:
+
+  --help
+    Prints this help summary and exits
+
+  --loc <DatabaseDirectory>
+    Sets the location in which the database should be created
+
+  --phase <Phase>
+    Sets the phase of the build to run, supported values are:
+
+      all    Full bulk load
+      data   Data phase only
+      index  Index phase only, requires the data phase to previously have been run
+
+    When not specified defaults to all
+
+EOF
+}
+
 # If JENA_HOME is empty
 if [ -z "$JENA_HOME" ]
 	then
@@ -48,7 +72,7 @@ case "$(uname)" in
 esac
 
 export JENA_CP
-echo $JENA_CP
+#echo $JENA_CP
 if [ -z "$SORT_ARGS" ]
 then
     SORT_ARGS="--buffer-size=50%"
@@ -84,6 +108,11 @@ do
       PHASE="$1"
       shift
       ;;
+    --help)
+      # Help
+      printUsage
+      exit 0
+      ;;
     *)
       # Once we see an unrecognized argument treat as start of files to process
       break
@@ -95,8 +124,8 @@ if [ -z "$PHASE" ]; then
   PHASE="all"
 fi
 
-echo "Location is '$LOC'"
-echo "Phase is '$PHASE'"
+#echo "Location is '$LOC'"
+#echo "Phase is '$PHASE'"
 
 log() { echo " $(date $DATE)" "$@" ; }
 

http://git-wip-us.apache.org/repos/asf/jena/blob/7b61a144/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index 372aa5c..5624854 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -80,6 +80,15 @@ KEEPWORKFILES="${KEEPWORKFILES:-}"
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
+if [ ! -e "$DATA_TRIPLES" ] ; then
+  echo "No triples data file found in location, please run the tdbloader2data script first"
+  exit 1
+fi
+if [ ! -e "$DATA_QUADS" ]; then
+  echo "No quads data file found in location, please run the tdbloader2data script first"
+  exit 1
+fi
+
 # ---- Index intermediates
 ## All files are writtern S P O / G S P O columns per row but in different sort orders.
 log "Index Building Phase"