#!/bin/bash

## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements.  See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership.  The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License.  You may obtain a copy of the License at
##
##     http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.

# TDB bulk loader driver.
#
# Usage: <script> --loc location datafile ...
#
# Environment:
#   TDBROOT        (required) installation root; "$TDBROOT/bin/tdb_path" is
#                  run to compute the Java classpath.
#   KEEPWORKFILES  set to "yes" to keep the intermediate text files.

# Exit on error.
set -e

# Sort order is ASCII.
# LC_ALL is the variable that sort(1) and the other locale-aware tools
# honour; the previously exported "LC_LOCALE" is not read by anything, so
# the byte-wise ordering was never actually in effect.
export LC_ALL="C"

# log MESSAGE...
# Write MESSAGE to stdout, prefixed with the current time formatted with $DATE.
# DATE is read when log is called, so it only has to be set before first use.
log() { echo " $(date "$DATE")" "$@" ; }

# The shell's PID makes the intermediate data file names unique per run.
TMP=$$
#DATE="+%Y-%m-%dT%H:%M:%S%:z"
DATE="+%H:%M:%S"

# Fail with a clear message instead of silently trying to run /bin/tdb_path.
: "${TDBROOT:?TDBROOT must be set to the TDB installation directory}"

CP="$("$TDBROOT/bin/tdb_path" "$TDBROOT")"
USAGE="Usage: $(basename "$0") --loc location datafile ..."
PKG=com.hp.hpl.jena.tdb.store.bulkloader2

if [[ "$#" -lt 2 ]]; then
    echo "$USAGE" 1>&2
    exit 1
fi

## Process --loc.  Yuk.
## Accepted forms: "-loc DIR", "--loc DIR", "-loc=DIR", "--loc=DIR".
ARG1="$1"
shift
if [[ "$ARG1" = "-loc" || "$ARG1" = "--loc" ]]; then
    LOC="$1"
    shift
else
    # Strip a leading "-loc=" / "--loc=" prefix; if nothing was stripped,
    # the first argument was not a --loc option at all.
    LOC="${ARG1/-*loc=/}"
    if [[ "$ARG1" = "$LOC" ]]; then
        echo "$USAGE" 1>&2
        exit 1
    fi
fi

if [[ ! -e "$LOC" ]]; then
    mkdir "$LOC"
fi
if [[ ! -d "$LOC" ]]; then
    # Diagnostics go to stderr, like the usage message.
    echo "Not a directory: $LOC" 1>&2
    exit 1
fi

# Keep the data files as an array so names containing whitespace or glob
# characters reach java as single, unmodified arguments.
FILES=("$@")
## Stdin?
KEEPWORKFILES="${KEEPWORKFILES:-}"

# ---- Start
log "-- TDB Bulk Loader Start"
TIME1="$(date +%s)"

# ---- Data loading phase
log "Data phase"
# Produce nodes file and triples/quads text file.

DATA_TRIPLES="$LOC/data-triples.$TMP"
DATA_QUADS="$LOC/data-quads.$TMP"

java -Xmx1200M -cp "$CP" -server "$PKG".CmdNodeTableBuilder \
    "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" "${FILES[@]}"

# ---- Index intermediates
## All files are written S P O / G S P O columns per row but in different sort orders.
log "Index phase"

# process_rows KEYS DATA IDX
#
# Sort the rows of DATA into the order given by KEYS and build index IDX
# from the sorted rows.
#
# Arguments:
#   KEYS  sort(1) key options as one whitespace-separated string,
#         e.g. "-k 2,2 -k 3,3 -k 1,1"
#   DATA  path of the text file of rows to index
#   IDX   index name (e.g. SPO); also names the work file and index files
# Globals read: LOC, CP, PKG, KEEPWORKFILES, and the log function.
# Does nothing when DATA is missing or empty.
process_rows() {
    local KEYS="$1"
    local DATA="$2"
    local IDX="$3"

    local WORK="$LOC/$IDX-txt"

    if [ ! -s "$DATA" ]; then
        return 0
    fi

    log "Index $IDX"
    # $KEYS is deliberately unquoted: it holds several sort options that must
    # be split into separate words.  LC_ALL=C forces byte-wise (ASCII) order
    # regardless of the caller's locale, and "$WORK" is quoted so a location
    # containing whitespace does not cause an "ambiguous redirect".
    LC_ALL=C sort -u $KEYS < "$DATA" > "$WORK"

    log "Build $IDX"
    # Remove any existing files for this index before building it.
    rm -f "$LOC/$IDX.dat"
    rm -f "$LOC/$IDX.idn"
    java -cp "$CP" -server "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"

    # Remove intermediary file.
    if [ "$KEEPWORKFILES" != "yes" ]; then
        rm "$WORK"
    fi
}

# Sort keys, one per column of the data files.
K1="-k 1,1"
K2="-k 2,2"
K3="-k 3,3"
K4="-k 4,4"

# Triple indexes: columns are S P O.
process_rows "$K1 $K2 $K3" "$DATA_TRIPLES" SPO
process_rows "$K2 $K3 $K1" "$DATA_TRIPLES" POS
process_rows "$K3 $K1 $K2" "$DATA_TRIPLES" OSP

# Quad indexes: columns are G S P O.
process_rows "$K1 $K2 $K3 $K4" "$DATA_QUADS" GSPO
process_rows "$K1 $K3 $K4 $K2" "$DATA_QUADS" GPOS
process_rows "$K1 $K4 $K2 $K3" "$DATA_QUADS" GOSP
process_rows "$K2 $K3 $K4 $K1" "$DATA_QUADS" SPOG
process_rows "$K3 $K4 $K2 $K1" "$DATA_QUADS" POSG
process_rows "$K4 $K2 $K3 $K1" "$DATA_QUADS" OSPG

log "Index phase end"
TIME2="$(date +%s)"

# ---- Clean up.
if [ "$KEEPWORKFILES" != "yes" ]; then
    rm -f "$DATA_TRIPLES" "$DATA_QUADS"
fi

# ---- End
log "-- TDB Bulk Loader Finish"
ELAPSED=$((TIME2 - TIME1))
log "-- $ELAPSED seconds"