You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2013/02/27 22:51:33 UTC
svn commit: r1450984 - in /jena/trunk/apache-jena/bin: tdbloader2
tdbloader2worker
Author: andy
Date: Wed Feb 27 21:51:33 2013
New Revision: 1450984
URL: http://svn.apache.org/r1450984
Log:
Restore the production scripts for tdbloader2
Modified:
jena/trunk/apache-jena/bin/tdbloader2
jena/trunk/apache-jena/bin/tdbloader2worker
Modified: jena/trunk/apache-jena/bin/tdbloader2
URL: http://svn.apache.org/viewvc/jena/trunk/apache-jena/bin/tdbloader2?rev=1450984&r1=1450983&r2=1450984&view=diff
==============================================================================
--- jena/trunk/apache-jena/bin/tdbloader2 (original)
+++ jena/trunk/apache-jena/bin/tdbloader2 Wed Feb 27 21:51:33 2013
@@ -1,39 +1,41 @@
-#!/bin/bash
+#!/bin/sh
+## Licensed under the terms of http://www.apache.org/licenses/LICENSE-2.0
-## Licensed to the Apache Software Foundation (ASF) under one
-## or more contributor license agreements. See the NOTICE file
-## distributed with this work for additional information
-## regarding copyright ownership. The ASF licenses this file
-## to you under the Apache License, Version 2.0 (the
-## "License"); you may not use this file except in compliance
-## with the License. You may obtain a copy of the License at
-##
-## http://www.apache.org/licenses/LICENSE-2.0
-##
-## Unless required by applicable law or agreed to in writing, software
-## distributed under the License is distributed on an "AS IS" BASIS,
-## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-## See the License for the specific language governing permissions and
-## limitations under the License.
-
-if [ "$TDBROOT" = "" ]
- then
- echo "TDBROOT is not set" 1>&2
- exit 1
-fi
-
-INIT="$TDBROOT/bin/tdb_init"
+# If JENA_HOME is empty
+if [ -z "$JENA_HOME" ]
+ then
+ SCRIPT="$0"
+ # Catch common issue: script has been symlinked
+ if [ -L "$SCRIPT" ]
+ then
+ SCRIPT="$(readlink "$0")"
+ # If link is relative
+ case "$SCRIPT" in
+ /*) ;; # fine
+ *) SCRIPT=$( dirname "$0" )/$SCRIPT;; # fix
+ esac
+ fi
-if [ ! -r "$INIT" ]
-then
- echo "Script $INIT (\$TDBROOT/bin/tdb_init) does not exist or is not readable"
- exit 1
+ # Work out root from script location
+ JENA_HOME="$( cd "$( dirname "$SCRIPT" )/.." && pwd )"
fi
-. "$INIT"
+# ---- Setup
+JVM_ARGS=${JVM_ARGS:--Xmx1024M}
+# Expand JENA_HOME but literal *
+JENA_CP="$JENA_HOME"'/lib/*'
+SOCKS=
+LOGGING="-Dlog4j.configuration=file:$JENA_HOME/jena-log4j.properties"
+
+# Platform specific fixup
+# On CYGWIN convert path and end with a ';'
+case "$(uname)" in
+ CYGWIN*) JENA_CP="$(cygpath -wp "$JENA_CP");";;
+esac
+
+export JENA_CP
-#echo "$TDB_CP"
-export JENA_CP="$TDB_CP"
## And --parallel=3 if available.
export SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
-exec "$TDBROOT/bin/tdbloader2worker" "$@"
+
+exec "$JENA_HOME/bin/tdbloader2worker" "$@"
Modified: jena/trunk/apache-jena/bin/tdbloader2worker
URL: http://svn.apache.org/viewvc/jena/trunk/apache-jena/bin/tdbloader2worker?rev=1450984&r1=1450983&r2=1450984&view=diff
==============================================================================
--- jena/trunk/apache-jena/bin/tdbloader2worker (original)
+++ jena/trunk/apache-jena/bin/tdbloader2worker Wed Feb 27 21:51:33 2013
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements. See the NOTICE file
@@ -16,24 +16,131 @@
## See the License for the specific language governing permissions and
## limitations under the License.
-if [ "$TDBROOT" = "" ]
- then
- echo "TDBROOT is not set" 1>&2
+# The environment for this sub-script is setup by "tdbloader2"
+
+# Exit on error.
+set -e
+
+# Sort order is ASCII
+export LC_LOCALE="C"
+
+log() { echo " $(date $DATE)" "$@" ; }
+
+TMP=$$
+#DATE="+%Y-%m-%dT%H:%M:%S%:z"
+DATE="+%H:%M:%S"
+
+##--parallel is not always available.
+SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
+JVM_ARGS=${JVM_ARGS:--Xmx1200M}
+
+# Classpath set in "tdbloader2"
+if [ -z "$JENA_CP" ]
+then
+ echo "Classpath not provided : set JENA_CP" 1>&2
exit 1
fi
-INIT="$TDBROOT/bin/tdb_init"
+USAGE="Usage: $(basename $0) --loc location datafile ..."
+PKG=com.hp.hpl.jena.tdb.store.bulkloader2
-if [ ! -r "$INIT" ]
-then
- echo "Script $INIT (\$TDBROOT/bin/tdb_init) does not exist or is not readable"
- exit 1
+if [ "$#" -lt 2 ] ; then echo "$USAGE" 1>&2 ; exit 1 ; fi
+
+## Process --loc. Yuk.
+ARG1="$1"
+shift
+if [ "$ARG1" = "-loc" -o "$ARG1" = "--loc" ]
+then
+ LOC="$1"
+ shift
+else
+ LOC="${ARG1/-*loc=/}"
+ if [ "$ARG1" = "$LOC" ] ; then echo $USAGE 1>&2 ; exit 1 ; fi
fi
-. "$INIT"
+if [ ! -e "$LOC" ] ; then mkdir "$LOC" ; fi
+if [ ! -d "$LOC" ] ; then echo "Not a directory: $LOC" ; exit 1 ; fi
+
+FILES="$@"
+## Stdin?
+KEEPWORKFILES="${KEEPWORKFILES:-}"
+# ---- Start
+log "-- TDB Bulk Loader Start"
+TIME1="$(date +%s)"
+
+# ---- Data loading phase
+log "Data phase"
+# Produce nodes file and triples/quads text file.
+
+DATA_TRIPLES="$LOC/data-triples.$TMP"
+DATA_QUADS="$LOC/data-quads.$TMP"
+
+java $JVM_ARGS -cp "$JENA_CP" "$PKG".CmdNodeTableBuilder \
+ "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" $FILES
+
+# ---- Index intermediates
+## All files are written S P O / G S P O columns per row but in different sort orders.
+log "Index phase"
+
+process_rows()
+{
+ local KEYS="$1"
+ local DATA="$2"
+ local IDX=$3
+ local WORK="$LOC/$IDX-txt"
+
+ if [ ! -s "$DATA" ]
+ then
+ return
+ fi
+
+ log "Index $IDX"
+ sort $SORT_ARGS -u $KEYS < "$DATA" > $WORK
+ log "Build $IDX"
+ rm -f "$LOC/$IDX.dat"
+ rm -f "$LOC/$IDX.idn"
+ java -cp "$JENA_CP" "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"
+ # Remove intermediary file.
+ if [ "$KEEPWORKFILES" != "yes" ]
+ then
+ rm "$WORK"
+ fi
+}
+
+K1="-k 1,1"
+K2="-k 2,2"
+K3="-k 3,3"
+K4="-k 4,4"
+
+process_rows "$K1 $K2 $K3" "$DATA_TRIPLES" SPO
+
+process_rows "$K2 $K3 $K1" "$DATA_TRIPLES" POS
+
+process_rows "$K3 $K1 $K2" "$DATA_TRIPLES" OSP
+
+process_rows "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO
+
+process_rows "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS
+
+process_rows "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP
+
+process_rows "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG
+
+process_rows "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
+
+process_rows "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG
+
+log "Index phase end"
+TIME2="$(date +%s)"
+
+# ---- Clean up.
+
+if [ "$KEEPWORKFILES" != "yes" ]
+then
+ rm -f "$DATA_TRIPLES" "$DATA_QUADS"
+fi
-#echo "$TDB_CP"
-export JENA_CP="$TDB_CP"
-## And --parallel=3 if available.
-export SORT_ARGS="${SORT_ARGS:---buffer-size=50%}"
-exec "$TDBROOT/bin/tdbloader2worker" "$@"
+# ---- End
+log "-- TDB Bulk Loader Finish"
+ELAPSED=$(($TIME2-$TIME1))
+log "-- $ELAPSED seconds"