#!/usr/bin/perl -w #======================================================================= # $Id$ # Cleans up files before they are going into the repository. # # Character set: UTF-8 # Licensed under the GNU General Public License version 2 or later. #======================================================================= use strict; $| = 1; use Getopt::Std; our ($opt_e, $opt_h) = ( 0, 0); getopts('eh') || die("Option error. Use -h for help.\n"); my $SEP = "ﳢ"; $opt_h && usage(0); my $Data = join("", <>); if ($opt_e) { # Prepare the file for editing or merging. # Convert all numeric entities to UTF-8. $Data =~ s/&#(\d{1,10});/widechar($1)/ge; $Data =~ s/&#x([0-9a-f]{1,8});/widechar(hex($1))/gei; # Remove all the funny chars between double dashes in comments. $Data =~ s/ﳢ//g; } else { # Prepare the file for commit or building. # Insert a character from the Unicode private use area between # double dashes in the English text. In the memory of the CBM-64 # we’ll use "ﳢ" — U+FCE2, also known as ﳢ. Necessary to stuff # something between the dashes because it’s forbidden to use double # dashes inside comments. ☠. $Data =~ s/()/ sprintf("%s%s%s", $1, cleanup_english($2), $3)/ges; # Convert all UTF-8 sequences into numerical entities. To make # RFC 3629 happy, only use four bytes. $Data =~ s/([\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])/ decode_char($1)/ge; $Data =~ s/([\xE0-\xEF][\x80-\xBF][\x80-\xBF])/ decode_char($1)/ge; $Data =~ s/([\xC0-\xDF][\x80-\xBF])/ decode_char($1)/ge; if ($Data =~ /[\x80-\xFF]/) { die("Invalid UTF-8 in stream, aborting.\n"); } } # And then finally dump the whole thing to stdout. print($Data); exit 0; sub cleanup_english { # Do necessary stuff to the English text. {{{ my $Txt = shift; $Txt =~ s/--/-$SEP-/g; $Txt =~ s/--/-$SEP-/g; return($Txt); # }}} } sub decode_char { # Converts an UTF-8 sequence into a numerical SGML/HTML/XML entity. # {{{ my $Msg = shift; my $Val = ""; # Accept invalid sequences (overlong sequences and surrogates) my $allow_invalid = 0; # Use base 10 in entities. ☹ my $use_decimal = 1; # Treat U+0080 through U+00FF as ISO-8859-1 characters my $use_latin1 = 0; if ($Msg =~ /^([\x20-\x7F])$/) { $Val = ord($1); } elsif ($Msg =~ /^([\xC0-\xDF])([\x80-\xBF])/) { if (!$allow_invalid && $Msg =~ /^[\xC0-\xC1]/) { $Val = 0xFFFD; } else { $Val = ((ord($1) & 0x1F) << 6) | (ord($2) & 0x3F); } } elsif ($Msg =~ /^([\xE0-\xEF])([\x80-\xBF])([\x80-\xBF])/) { if (!$allow_invalid && $Msg =~ /^\xE0[\x80-\x9F]/) { # Overlong sequence $Val = 0xFFFD; } else { $Val = ((ord($1) & 0x0F) << 12) | ((ord($2) & 0x3F) << 6) | ( ord($3) & 0x3F); } } elsif ($Msg =~ /^([\xF0-\xF7])([\x80-\xBF]) ([\x80-\xBF])([\x80-\xBF])/x) { if (!$allow_invalid && $Msg =~ /^\xF0[\x80-\x8F]/) { # Overlong sequence $Val = 0xFFFD; } else { $Val = ((ord($1) & 0x07) << 18) | ((ord($2) & 0x3F) << 12) | ((ord($3) & 0x3F) << 6) | ( ord($4) & 0x3F); } } elsif ($Msg =~ /^([\xF8-\xFB])([\x80-\xBF]) ([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/x) { if (!$allow_invalid && $Msg =~ /^\xF8[\x80-\x87]/) { # Overlong sequence $Val = 0xFFFD; } else { $Val = ((ord($1) & 0x03) << 24) | ((ord($2) & 0x3F) << 18) | ((ord($3) & 0x3F) << 12) | ((ord($4) & 0x3F) << 6) | ( ord($5) & 0x3F); } } elsif ($Msg =~ /^([\xFC-\xFD])([\x80-\xBF])([\x80-\xBF]) ([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/x) { if (!$allow_invalid && $Msg =~ /^\xFC[\x80-\x83]/) { # Overlong sequence $Val = 0xFFFD; } else { $Val = ((ord($1) & 0x01) << 30) | ((ord($2) & 0x3F) << 24) | ((ord($3) & 0x3F) << 18) | ((ord($4) & 0x3F) << 12) | ((ord($5) & 0x3F) << 6) | ( ord($6) & 0x3F); } } unless ($allow_invalid) { if ( ($Val >= 0xD800 && $Val <= 0xDFFF) || ($Val eq 0xFFFE) || ($Val eq 0xFFFF) ) { # Surrogate char or non-existing chars U+FFFE or U+FFFF $Val = 0xFFFD; } } return ($use_latin1 && ($Val <= 0xFF)) ? chr($Val) : sprintf(($use_decimal ? "&#%u;" : "&#x%X;"), $Val ); # }}} } # decode_char() sub widechar { # Convert numeric value into the corresponding UTF-8 sequence {{{ my $Val = shift; # Allow Invalid character range U+D800 through U+DFFF, U+FFFE and # U+FFFF. my $allow_invalid = 0; if ($Val < 0x80) { return sprintf("%c", $Val); } elsif ($Val < 0x800) { return sprintf("%c%c", 0xC0 | ($Val >> 6), 0x80 | ($Val & 0x3F) ); } elsif ($Val < 0x10000) { unless ($allow_invalid) { if (($Val >= 0xD800 && $Val <= 0xDFFF) || ($Val eq 0xFFFE) || ($Val eq 0xFFFF)) { $Val = 0xFFFD; } } return sprintf("%c%c%c", 0xE0 | ($Val >> 12), 0x80 | (($Val >> 6) & 0x3F), 0x80 | ($Val & 0x3F) ); } elsif ($Val < 0x200000) { return sprintf("%c%c%c%c", 0xF0 | ($Val >> 18), 0x80 | (($Val >> 12) & 0x3F), 0x80 | (($Val >> 6) & 0x3F), 0x80 | ($Val & 0x3F) ); } elsif ($Val < 0x4000000) { return sprintf("%c%c%c%c%c", 0xF8 | ($Val >> 24), 0x80 | (($Val >> 18) & 0x3F), 0x80 | (($Val >> 12) & 0x3F), 0x80 | (($Val >> 6) & 0x3F), 0x80 | ( $Val & 0x3F) ); } elsif ($Val < 0x80000000) { return sprintf("%c%c%c%c%c%c", 0xFC | ($Val >> 30), 0x80 | (($Val >> 24) & 0x3F), 0x80 | (($Val >> 18) & 0x3F), 0x80 | (($Val >> 12) & 0x3F), 0x80 | (($Val >> 6) & 0x3F), 0x80 | ( $Val & 0x3F) ); } else { return widechar(0xFFFD); } # }}} } # widechar() sub usage { # Good ol’ help screen. {{{ my $Retval = shift; print(<