You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by rv...@apache.org on 2015/07/07 14:53:08 UTC

[09/18] jena git commit: Finish up first pass of work on tdbloader2 script refactoring (JENA-977)

Finish up first pass of work on tdbloader2 script refactoring (JENA-977)

- Add command line options for setting the JVM and sort arguments so that
  they no longer have to be supplied via environment variables.  NB: for
  backwards compatibility the existing environment variables are still
  honoured if the new command line options are not used.
- Improve some error messages
- Explicitly support -- for separating options from data files, for cases
  where a file name could otherwise be mistaken for an option


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/d4a0bc50
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/d4a0bc50
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/d4a0bc50

Branch: refs/heads/master
Commit: d4a0bc50a6d82ab5bbb43ab90e65216e5b165621
Parents: cc4a80a
Author: Rob Vesse <rv...@apache.org>
Authored: Tue Jun 30 15:04:50 2015 +0100
Committer: Rob Vesse <rv...@apache.org>
Committed: Tue Jun 30 15:04:50 2015 +0100

----------------------------------------------------------------------
 apache-jena/bin/tdbloader2       | 76 ++++++++++++++++++++++++++++-------
 apache-jena/bin/tdbloader2common |  4 ++
 apache-jena/bin/tdbloader2data   | 40 +++++++++++++++---
 apache-jena/bin/tdbloader2index  | 72 ++++++++++++++++++++++++++++-----
 4 files changed, 162 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/d4a0bc50/apache-jena/bin/tdbloader2
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2 b/apache-jena/bin/tdbloader2
index d0d906c..e598aeb 100755
--- a/apache-jena/bin/tdbloader2
+++ b/apache-jena/bin/tdbloader2
@@ -53,6 +53,20 @@ Advanced additional options are as follows:
   --debug
     Enable debug mode, adds extra debug output
 
+  -j <JvmArgs>
+  --jvm-args <JvmArgs>
+    Sets the arguments that should be passed to the JVM for the
+    JVM based portions of the build.
+
+    Generally it is best to not change these unless you have been
+    specifically advised to.  The scripts will use appropriate
+    defaults if this is not specified.
+
+    In particular be careful increasing the heap size since many
+    parts of TDB actually use memory mapped files that live
+    outside the heap so if the heap is too large the heap may
+    conflict with the memory mapped files for memory space.
+
   -k
   --keep-work
     Keeps the temporary work files around after they are no longer
@@ -69,6 +83,14 @@ Advanced additional options are as follows:
 
     When no phase is specified it defaults to all
 
+  -s <SortArgs>
+  --sort-args <SortArgs>
+    Sets the arguments that should be passed to sort for the sort
+    based portions of the build.
+
+    Generally it is best not to change these as the scripts will
+    use appropriate defaults for your system.
+
   -t
   --trace
     Enable trace mode, essentially sets -x within the scripts
@@ -111,15 +133,7 @@ case "$(uname)" in
 esac
 
 export JENA_CP
-# echo JENA_CP
-if [ -z "$SORT_ARGS" ]; then
-    SORT_ARGS="--buffer-size=50%"
-    sort --parallel=3 < /dev/null 2>/dev/null
-    if [ $? = 0 ]; then
-    	SORT_ARGS="$SORT_ARGS --parallel=3"
-    fi
-fi
-export SORT_ARGS
+
 
 # Process arguments
 LOC=
@@ -127,6 +141,8 @@ PHASE=
 KEEP_WORK=0
 DEBUG=0
 TRACE=0
+JVM_ARGS=
+SORT_ARGS=
 
 while [ $# -gt 0 ]
 do
@@ -142,6 +158,12 @@ do
       printUsage
       exit 0
       ;;
+    -j|--jvm-args)
+      # JVM Arguments
+      shift
+      JVM_ARGS="$1"
+      shift
+      ;;
     -k|--keep-work)
       # Keep work files
       shift
@@ -164,14 +186,30 @@ do
       PHASE="$1"
       shift
       ;;
+    -s|--sort-args)
+      # Sort arguments
+      shift
+      SORT_ARGS=$1
+      shift
+      ;;
     -t|--trace)
       # Trace mode
       shift
       TRACE=1
       set -x
       ;;
+    --)
+      # Arguments separator
+      # All further arguments are treated as data files
+      shift
+      break
+      ;;
+    -*)
+      # Looks like an option but not known
+      abort 1 "Unrecognized option $ARG, if this was meant to be a data file separate options from data files with --"
+      ;;
     *)
-      # Once we see an unrecognized argument treat as start of files to process
+      # Once we see an unrecognized argument that doesn't look like an option treat as start of files to process
       break
       ;;
   esac
@@ -180,7 +218,11 @@ done
 if [ -z "$PHASE" ]; then
   PHASE="all"
 fi
+
+# Prepare arguments to pass to children
 COMMON_ARGS=
+DATA_ARGS=
+INDEX_ARGS=
 if [ $KEEP_WORK = 1 ]; then
   COMMON_ARGS="--keep-work"
 fi
@@ -190,6 +232,12 @@ fi
 if [ $TRACE = 1 ]; then
   COMMON_ARGS="$COMMON_ARGS --trace"
 fi
+if [ -n "$JVM_ARGS" ]; then
+  COMMON_ARGS="$COMMON_ARGS --jvm-args $JVM_ARGS"
+fi
+if [ -n "$SORT_ARGS" ]; then
+  INDEX_ARGS="--sort-args $SORT_ARGS"
+fi
 
 # ---- Start
 info "-- TDB Bulk Loader Start"
@@ -200,14 +248,14 @@ case "$PHASE" in
   all)
     # All Phases
     # Data Phase
-    "${TOOL_DIR}/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
+    "${TOOL_DIR}/tdbloader2data" $COMMON_ARGS $DATA_ARGS --loc "$LOC" -- "$@"
     RET=$?
     if [ $RET -ne 0 ]; then
       abort $RET "Failed during data phase"
     fi
 
     # Index Phase
-    "${TOOL_DIR}/tdbloader2index" $COMMON_ARGS --loc "$LOC"
+    "${TOOL_DIR}/tdbloader2index" $COMMON_ARGS $INDEX_ARGS --loc "$LOC"
     RET=$?
     if [ $RET -ne 0 ]; then
       abort $RET "Failed during data phase"
@@ -216,7 +264,7 @@ case "$PHASE" in
 
   data)
     # Data Phase
-    "${TOOL_DIR}/tdbloader2data" $COMMON_ARGS --loc "$LOC" "$@"
+    "${TOOL_DIR}/tdbloader2data" $COMMON_ARGS $DATA_ARGS --loc "$LOC" -- "$@"
     RET=$?
     if [ $RET -ne 0 ]; then
       abort $RET "Failed during data phase"
@@ -225,7 +273,7 @@ case "$PHASE" in
 
   index)
     # Index Phase
-    "${TOOL_DIR}/tdbloader2index" $COMMON_ARGS --loc "$LOC"
+    "${TOOL_DIR}/tdbloader2index" $COMMON_ARGS $INDEX_ARGS --loc "$LOC"
     RET=$?
     if [ $RET -ne 0 ]; then
       abort $RET "Failed during index phase"

http://git-wip-us.apache.org/repos/asf/jena/blob/d4a0bc50/apache-jena/bin/tdbloader2common
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2common b/apache-jena/bin/tdbloader2common
index 2830545..2c116ad 100644
--- a/apache-jena/bin/tdbloader2common
+++ b/apache-jena/bin/tdbloader2common
@@ -80,7 +80,10 @@ function getDriveInfo() {
 }
 
 function getFreeMem() {
+  # May be called from a script where exit on error is set
+  # in which case disable for the life of this function
   set +e
+
   local FREE_MEM=-1
   case "$OSTYPE" in
     darwin*)
@@ -98,6 +101,7 @@ function getFreeMem() {
       fi
       ;;
   esac
+
   set -e
 
   echo "$FREE_MEM"

http://git-wip-us.apache.org/repos/asf/jena/blob/d4a0bc50/apache-jena/bin/tdbloader2data
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2data b/apache-jena/bin/tdbloader2data
index 2c48a50..f942e20 100755
--- a/apache-jena/bin/tdbloader2data
+++ b/apache-jena/bin/tdbloader2data
@@ -63,6 +63,20 @@ Advanced additional options are as follows:
   --debug
     Enable debug mode, adds extra debug output
 
+  -j <JvmArgs>
+  --jvm-args <JvmArgs>
+    Sets the arguments that should be passed to the JVM for the
+    JVM based portions of the build.
+
+    Generally it is best to not change these unless you have been
+    specifically advised to.  The scripts will use appropriate
+    defaults if this is not specified.
+
+    In particular be careful increasing the heap size since many
+    parts of TDB actually use memory mapped files that live
+    outside the heap so if the heap is too large the heap may
+    conflict with the memory mapped files for memory space.
+
   -k
   --keep-work
     Keeps the temporary work files around after they are no longer
@@ -96,6 +110,12 @@ do
       printUsage
       exit 0
       ;;
+    -j|--jvm-args)
+      # JVM Arguments
+      shift
+      JVM_ARGS="$1"
+      shift
+      ;;
     -k|--keep-work)
       # Keep work files
       # This option is actually not used by this script but may be passed in
@@ -119,6 +139,16 @@ do
       shift
       set -x
       ;;
+    --)
+      # Arguments separator
+      # All further arguments are treated as data files
+      shift
+      break
+      ;;
+    -*)
+      # Unrecognized
+      abort 1 "Unrecognized option $ARG, if this was meant to be a data file separate options from data files with --"
+      ;;
     *)
       # Any further arguments are treated as data files
       break
@@ -128,10 +158,10 @@ done
 
 # Verify arguments
 if [ -z "$LOC" ]; then
-  abort 1 "No location specified"
+  abort 1 "Required database location not specified"
 fi
 if [ $# = 0 ]; then
-  abort 1 "No data files specified"
+  abort 1 "No data files specified, one/more data files must be specified"
 fi
 
 # Make LOC absolute
@@ -145,7 +175,7 @@ fi
 # Skip a possible configuration file
 if test -n "$(find "$LOC" -maxdepth 1 -type f ! -name 'this.*' -print -quit)"
 then 
-    abort 1 "Location is not empty: $LOC"
+    abort 1 "Database location is not empty: $LOC"
 fi
 
 if [ ! -e "$LOC" ] ; then
@@ -158,12 +188,12 @@ if [ ! -e "$LOC" ] ; then
   debug "New database directory created: $LOC"
 fi
 if [ ! -d "$LOC" ]; then
-  abort 1 "Location is not a directory: $LOC"
+  abort 1 "Database location is not a directory: $LOC"
 fi
 
 FILES="$@"
 
-## JVM Arguments
+## Prepare JVM Arguments
 JVM_ARGS=${JVM_ARGS:--Xmx1200M}
 debug "JVM Arguments are $JVM_ARGS"
 

http://git-wip-us.apache.org/repos/asf/jena/blob/d4a0bc50/apache-jena/bin/tdbloader2index
----------------------------------------------------------------------
diff --git a/apache-jena/bin/tdbloader2index b/apache-jena/bin/tdbloader2index
index 15a5832..b997b39 100755
--- a/apache-jena/bin/tdbloader2index
+++ b/apache-jena/bin/tdbloader2index
@@ -60,11 +60,33 @@ Advanced additional options are as follows:
   --debug
     Enable debug mode, adds extra debug output
 
+  -j <JvmArgs>
+  --jvm-args <JvmArgs>
+    Sets the arguments that should be passed to the JVM for the
+    JVM based portions of the build.
+
+    Generally it is best to not change these unless you have been
+    specifically advised to.  The scripts will use appropriate
+    defaults if this is not specified.
+
+    In particular be careful increasing the heap size since many
+    parts of TDB actually use memory mapped files that live
+    outside the heap so if the heap is too large the heap may
+    conflict with the memory mapped files for memory space.
+
   -k
   --keep-work
     Keeps the temporary work files around after they are no longer
     needed.  May be useful for debugging.
 
+  -s <SortArgs>
+  --sort-args <SortArgs>
+    Sets the arguments that should be passed to sort for the sort
+    based portions of the build.
+
+    Generally it is best not to change these as the scripts will
+    use appropriate defaults for your system.
+
   -t
   --trace
     Enable trace mode, essentially sets -x within the scripts
@@ -81,6 +103,8 @@ export LC_ALL="C"
 LOC=
 KEEP_WORK=0
 DEBUG=0
+JVM_ARGS=
+SORT_ARGS=
 
 while [ $# -gt 0 ]
 do
@@ -95,6 +119,12 @@ do
       printUsage
       exit 0
       ;;
+    -j|--jvm-args)
+      # JVM Arguments
+      shift
+      JVM_ARGS="$1"
+      shift
+      ;;
     -k|--keep-work)
       # Keep work files
       shift
@@ -111,21 +141,27 @@ do
       LOC=${ARG/-*loc=/}
       shift
       ;;
+    -s|--sort-args)
+      # Sort arguments
+      shift
+      SORT_ARGS=$1
+      shift
+      ;;
     -t|--trace)
       # Trace mode
       shift
       set -x
       ;;
     *)
-      # Any further arguments are ignored
-      break
+      # Additional options are not supported
+      abort 1 "Unrecognized option $ARG"
       ;;
   esac
 done
 
 # Verify arguments
 if [ -z "$LOC" ]; then
-  abort 1 "No location specified"
+  abort 1 "Required database location not specified"
 fi
 
 # Make LOC absolute
@@ -137,26 +173,40 @@ fi
 
 # Check location
 if [ ! -e "$LOC" ]; then
-  abort 1 "Location specified does not exist: $LOC"
+  abort 1 "Database location specified does not exist: $LOC"
 fi
 if [ ! -d "$LOC" ]; then
-  abort 1 "Location is not a directory: $LOC"
+  abort 1 "Database location is not a directory: $LOC"
 fi
 
+# Locate and check data text files
 DATA_TRIPLES="$LOC/data-triples.tmp"
 DATA_QUADS="$LOC/data-quads.tmp"
 
 if [ ! -e "$DATA_TRIPLES" ]; then
-  abort 1 "No triples text file found in location, please run the tdbloader2data script first"
+  abort 1 "No triples text file found in database location, please run the tdbloader2data script first"
 fi
 if [ ! -e "$DATA_QUADS" ]; then
-  abort 1 "No quads text file found in location, please run the tdbloader2data script first"
+  abort 1 "No quads text file found in database location, please run the tdbloader2data script first"
 fi
 
 debug "Data text files are $DATA_TRIPLES and $DATA_QUADS"
 
-##--parallel is not always available.
-SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
+# Prepare sort arguments
+if [ -z "$SORT_ARGS" ]; then
+    SORT_ARGS="--buffer-size=50%"
+
+    ##--parallel is not always available.
+    # Temporarily disable exit on error while we check for --parallel support
+    set +e
+    sort --parallel=3 < /dev/null 2>/dev/null
+    if [ $? = 0 ]; then
+    	SORT_ARGS="$SORT_ARGS --parallel=3"
+    fi
+    set -e
+fi
+
+# Prepare JVM arguments
 JVM_ARGS=${JVM_ARGS:--Xmx1200M}
 debug "JVM Arguments are $JVM_ARGS"
 
@@ -201,7 +251,7 @@ fi
 SORT_TEMP_DIR=$(makeAbsolute "$SORT_TEMP_DIR")
 debug "Sort Temp Directory: $SORT_TEMP_DIR"
 SORT_DRIVE_INFO=($(getDriveInfo "${SORT_TEMP_DIR}"))
-debug "Sort Temp Directory ${DIR} is on disk ${SORT_DRIVE_INFO[0]} which has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes)"
+debug "Sort Temp Directory is on disk ${SORT_DRIVE_INFO[0]} which has ${SORT_DRIVE_INFO[2]}% free space (${SORT_DRIVE_INFO[3]} bytes)"
 
 if [ "${SORT_DRIVE_INFO[2]}" -le 10 ]; then
   warn "-----"
@@ -288,7 +338,7 @@ generate_index()
     info "Build $IDX"
     rm -f "$LOC/$IDX.dat"
     rm -f "$LOC/$IDX.idn"
-    java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
+    java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
     info "Build $IDX Completed"
 
     # Remove work file unless keeping