#!/bin/bash
#
# Splits a MediaWiki XML dump into one text file per article.
#
# Usage: <script> DUMP.xml [OUTDIR]
#   $1  XML dump of a wiki as produced by the http://download.wikimedia.org/
#       services.
#   $2  Folder that receives the output (default: current directory; it is
#       created if it does not exist).
#
# For every article two files are appended to, named after the article title
# with a few characters replaced by '_':
#   Article_Name.txt   the article title, a space, and the article text
#   Article_Name.uri   the original wiki URI, built from $wprefix
#
# Articles whose title contains ':' (i.e. not in the 'main' namespace) are
# skipped.
#

# Prefix prepended to the (underscored) article title to form its URI.
wprefix=http://en.wikinews.org/wiki/

#######################################
# Reads the next "tag>content" chunk (everything up to the next '<') from
# stdin.
# Globals:   E (written) - tag name including attributes
#            C (written) - text that follows the tag's closing '>'
# Returns:   non-zero once the input is exhausted
#######################################
rdom() {
  local IFS='>'
  # -r keeps backslashes literal; without it they are eaten and '\<' would
  # even escape the chunk delimiter.
  read -r -d '<' E C
}

#######################################
# Parses the dump and writes the per-article .txt / .uri files.
# Globals:   wprefix (read)
# Arguments: $1 - path to the XML dump
#            $2 - output folder (optional, defaults to ./)
# Outputs:   one "Parsed: <uri>" or "Skipped <title>" line per article on
#            stdout; diagnostics on stderr
# Returns:   0 on success, 2 on missing argument, 1 on I/O setup failure
#######################################
main() {
  local input=${1:-}
  local outdir=${2:-./}
  # E and C are filled in by rdom (dynamic scoping keeps them out of the
  # global namespace).
  local E='' C=''
  local wtitle='' wtext='' wuri='' title fname furi

  if [[ -z $input ]]; then
    printf 'Usage: %s DUMP.xml [OUTDIR]\n' "${0##*/}" >&2
    return 2
  fi
  if [[ ! -r $input ]]; then
    printf 'Cannot read input file: %s\n' "$input" >&2
    return 1
  fi

  # Resolve the output folder to an absolute path. Fail loudly instead of
  # silently falling back to the current directory when it is unusable.
  # CDPATH is cleared so that cd never echoes a directory into the result.
  if ! mkdir -p -- "$outdir" || ! outdir=$(CDPATH= cd -- "$outdir" && pwd); then
    printf 'Cannot use output folder: %s\n' "$outdir" >&2
    return 1
  fi

  while rdom; do
    if [[ $E == title ]]; then
      wtitle=$C
    elif [[ $E == text\ * ]]; then
      # Only <text ...> tags carrying attributes are recognised.
      wtext=$C
    fi

    # Wait until both the title and the text of an article have been seen.
    [[ -n $wtitle && -n $wtext ]] || continue

    if [[ $wtitle == *:* ]]; then
      # A namespace prefix means this is not a content article.
      printf 'Skipped %s\n' "$wtitle"
    else
      # URI form of the title: spaces become underscores.
      title=${wtitle// /_}
      # File name: additionally neutralise characters that are unsafe in a
      # file name (backslash, slash, colon).
      fname=${title//\\/_}
      fname=${fname//\//_}
      fname=${fname//:/_}

      wuri=$wprefix$title
      furi=$outdir/$fname.uri
      fname=$outdir/$fname.txt

      printf 'Parsed: %s\n' "$wuri"
      printf '%s\n' "$wtitle $wtext" >>"$fname"
      printf '%s\n' "$wuri" >>"$furi"
    fi

    wtitle=''
    wtext=''
    wuri=''
  done <"$input"
}

main "$@"