#!/bin/bash
#
# TDB bulk loader driver.
#
# Usage: <script> --loc location datafile ...
#        (also accepted: -loc location, or --loc=location)
#
# Phases:
#   1. Data phase:  run CmdNodeTableBuilder over the data files; it is given
#                   the location plus the names of a triples and a quads
#                   text file to produce inside that location.
#   2. Index phase: for each index order, sort the matching text file on the
#                   relevant columns and feed the result to CmdIndexBuild.
#   3. Clean up the intermediate text files and report elapsed seconds.
#
# Environment:
#   TDBROOT - passed to make_classpath to compute the Java classpath.
#
# NOTE(review): make_classpath is not defined in this script; it has to be
# resolvable as a command (or function) when the script runs.

# Exit on error.
set -e

# TODO

# Print a message prefixed with the current time (format taken from $DATE).
function log {
  echo " $(date "$DATE")" "$@"
}

# The shell's PID makes the intermediate file names unique per run.
TMP=$$
#DATE="+%Y-%m-%dT%H:%M:%S%:z"
DATE="+%H:%M:%S"
# ${TDBROOT:+...} keeps the original "no argument at all" call when TDBROOT is
# unset or empty, while protecting a TDBROOT that contains whitespace.
CP="$(make_classpath ${TDBROOT:+"$TDBROOT"})"
USAGE="Usage: $(basename "$0") --loc location datafile ..."
PKG=com.hp.hpl.jena.tdb.store.bulkloader2

if [[ "$#" -lt 2 ]]; then
  echo "$USAGE" 1>&2
  exit 1
fi

## Process --loc. Yuk.
ARG1="$1"
shift
if [[ "$ARG1" = "-loc" || "$ARG1" = "--loc" ]]; then
  # Two-word form: the location is the next argument.
  LOC="$1"
  shift
else
  # One-word form: strip the leading "-loc=" / "--loc=" prefix.
  LOC="${ARG1/-*loc=/}"
  # Nothing was stripped, so the first argument was not a --loc option.
  if [[ "$ARG1" = "$LOC" ]]; then
    echo "$USAGE" 1>&2
    exit 1
  fi
fi

if [[ ! -e "$LOC" ]]; then
  mkdir "$LOC"
fi
if [[ ! -d "$LOC" ]]; then
  echo "Not a directory: $LOC" 1>&2
  exit 1
fi

# Keep the data files as an array so names containing whitespace or glob
# characters reach java as single, unmodified arguments.
FILES=("$@")
## Stdin?

# ---- Start
log "-- TDB Bulk Loader Start"
TIME1="$(date +%s)"

# ---- Data loading phase
log "Data phase"

# Produce nodes file and triples/quads text file.
DATA_TRIPLES="$LOC/data-triples.$TMP"
DATA_QUADS="$LOC/data-quads.$TMP"

java -cp "$CP" -server "$PKG".CmdNodeTableBuilder \
  "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" "${FILES[@]}"

# ---- Index intermediates
## All files are written S P O but in different sort orders.

log "Index phase"

# Sort to SPO etc
## I1="SPO POS OSP"
## I2="GSPO GPOS GOSP SPOG POSG OSPG"
## I3="node2id prefixIdx prefix2id"

# Build one index from a text data file.
# Globals:   LOC, CP, PKG (read)
# Arguments: $1 - sort key options as one space-separated string
#                 (e.g. "-k2 -k3 -k1")
#            $2 - path of the text data file to sort
#            $3 - index name (e.g. SPO); also names the output files
# Returns:   0 immediately when the data file is missing or empty; otherwise
#            any failing command aborts the script through `set -e`.
function process_rows {
  local keys="$1"
  local data="$2"
  local idx="$3"
  local work="$LOC/$idx-txt"
  local -a sort_keys

  # Nothing to index (e.g. no quads in the input).
  if [[ ! -s "$data" ]]; then
    return 0
  fi

  # Split the key string into separate sort options without glob expansion.
  read -r -a sort_keys <<< "$keys"

  log "Index $idx"
  sort "${sort_keys[@]}" < "$data" > "$work"

  log "Build $idx"
  # Remove any existing index files before CmdIndexBuild runs.
  rm -f -- "$LOC/$idx.dat" "$LOC/$idx.idn"
  java -cp "$CP" -server "$PKG".CmdIndexBuild "$LOC" "$idx" "$work"

  # Remove intermediary file.
  rm -- "$work"
}

# Better - sort, do index, delete intermediate file, repeat
process_rows "-k1 -k2 -k3" "$DATA_TRIPLES" SPO
process_rows "-k2 -k3 -k1" "$DATA_TRIPLES" POS
process_rows "-k3 -k1 -k2" "$DATA_TRIPLES" OSP

process_rows "-k1 -k2 -k3 -k4" "$DATA_QUADS" GSPO
process_rows "-k1 -k3 -k4 -k2" "$DATA_QUADS" GPOS
process_rows "-k1 -k4 -k2 -k3" "$DATA_QUADS" GOSP
process_rows "-k2 -k3 -k4 -k1" "$DATA_QUADS" SPOG
process_rows "-k3 -k4 -k2 -k1" "$DATA_QUADS" POSG
process_rows "-k4 -k2 -k3 -k1" "$DATA_QUADS" OSPG

log "Index phase end"
TIME2="$(date +%s)"

## # ---- Index loading phase
##
## for IDX in $I1
## do
##     log "Build index $IDX"
##     ## The B+Tree builder creates a tree from scratch; remove old files.
##     rm -f "$LOC/$IDX.dat"
##     rm -f "$LOC/$IDX.idn"
##     java -cp "$CP" -server "$PKG".CmdIndexBuild "$LOC" "$IDX" "$LOC/$IDX-txt"
## done

# ---- Clean up.
rm -f -- "$DATA_TRIPLES" "$DATA_QUADS"

# ---- End
log "-- TDB Bulk Loader Finish"
# Elapsed time is measured up to the end of the index phase (TIME2).
ELAPSED=$((TIME2 - TIME1))
log "-- $ELAPSED seconds"