#!/bin/bash
#
# Parses {1} - an XML dump of a wiki as produced by the http://download.wikimedia.org/ services - and produces
# a single .txt file for each article entry, saving them in folder {2}.
# Each .txt file is named after the article title, with a few characters replaced. An additional .uri file is produced, which contains the original wiki URI built from $wprefix.
# Eg
# Article_Name.txt
# Article_Name.uri
#
# Skips articles not in the 'main' namespace
#
#
# Reads the next chunk of stdin, up to (not including) the next '<'.
# Sets two globals:
#   E - the text before the first '>' (the tag name plus any attributes)
#   C - everything after that '>' (the character content following the tag)
# Returns the status of read, which is non-zero once end of input is reached.
# -r keeps backslashes in the input literal instead of consuming them as escapes.
rdom () { local IFS=\> ; read -r -d \< E C ;}
# Output folder: second argument, defaulting to the current directory.
outdir=${2:-./}

# Resolve the folder to an absolute path. Abort if it cannot be entered, instead of
# silently falling back to whatever directory pwd reports.
# CDPATH is cleared so cd neither searches it nor prints the directory it picked.
if ! outdir=$(CDPATH= cd -- "$outdir" && pwd); then
  echo "Cannot access output folder: ${2:-./}" >&2
  exit 1
fi

# Base URI that the underscore-normalized article title is appended to.
wprefix=http://en.wikinews.org/wiki/
wtitle=""
wtext=""
wuri=""
# Each rdom call sets E to the current tag (name plus attributes) and C to the content after it.
while rdom; do
    if [[ $E == title ]]; then
        wtitle=$C
    fi
    # Only <text ...> tags that carry attributes are matched here.
    if [[ $E == text\ * ]]; then
        wtext=$C
    fi
    # Emit the article once both a title and a non-empty text have been collected.
    if [[ -n $wtitle && -n $wtext ]]; then
        # Any title containing a colon is treated as namespaced, i.e. not a content article.
        # NOTE(review): this also skips main-namespace titles that merely contain ':'.
        if [[ $wtitle == *:* ]]; then
            printf 'Skipped %s\n' "$wtitle"
        else
            # Parameter expansion replaces the former echo|sed pipelines: no fork per
            # article, and no glob expansion or whitespace collapsing of the title.
            title=${wtitle// /_}
            # File names additionally get '/', ':' and '\' replaced so they stay a single path component.
            fname=${title//\//_}
            fname=${fname//:/_}
            fname=${fname//\\/_}
            wuri="$wprefix$title"
            furi=$outdir/$fname.uri
            fname=$outdir/$fname.txt
            printf 'Parsed: %s\n' "$wuri"
            # Files are appended to, so repeated titles accumulate in the same file.
            printf '%s\n\n%s\n' "$wtitle" "$wtext" >>"$fname"
            printf '%s\n' "$wuri" >>"$furi"
        fi
        wtitle=""
        wtext=""
        wuri=""
    fi
done < "${1}"