From e8e22da9cc38ada2bedbc64c880c508ce7877343 Mon Sep 17 00:00:00 2001 From: Jonathan Blake Date: Thu, 18 Jul 2019 20:46:01 -0700 Subject: [PATCH] Prepare XMLize for Unicode input --- common/scripts/xmlize.pl | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/common/scripts/xmlize.pl b/common/scripts/xmlize.pl index 277ceb0..ed75116 100755 --- a/common/scripts/xmlize.pl +++ b/common/scripts/xmlize.pl @@ -6,6 +6,8 @@ use strict; use warnings; +use utf8; +use open ':encoding(UTF-8)'; my $FILE_EXTENSION = 'txt'; @@ -118,20 +120,21 @@ print << "(End of XML footer)"; sub xmlize { my( $inline, $infile ) = @_; - $inline =~ s/[[:space:]]*(\.\.\.|\.\s\.\s\.)[[:space:]]*//g; $inline =~ tr/\t/ /; - $inline =~ s/\s{2,}/ /g; - $inline =~ s/\s+$//; - $inline =~ s/\&\s//g; - $inline =~ tr/\"\`\222\221/\'/; + $inline =~ s/[[:space:]]{2,}/ /g; + $inline =~ s/[[:space:]]+$//; + $inline =~ s/^[[:space:]]+//; + $inline =~ s/[[:space:]]*(\.\.\.|\.\s\.\s\.)[[:space:]]*//g; + + $inline =~ s/\&(?=[[:space:]])//g; + $inline =~ tr/\"\`/\'/; + $inline =~ s/[\N{U+2018}\N{U+201C}]//g; + $inline =~ s/[\N{U+2019}\N{U+201D}]/<\/quote>/g; + $inline =~ s/[\N{U+2014}]//g; + $inline =~ s/[\N{U+2014}]//g; + $inline =~ s/(Random\sNumber\sTable)/$1<\/a>/gi; $inline =~ s/(Action\sCharts?)/$1<\/a>/gi; - # \222 and \221 are some form of funky right and - # left quotes not present in ascii (of course) - $inline =~ tr/\227/-/; - # \227 is an em or en dash - - $inline =~ s/^\s*(.*)\s*$/$1/; if( $inline =~ /^\*/ ) { $inline =~ s/^\*\s*/
    \n
  • /; @@ -162,11 +165,11 @@ sub xmlize { $inline = " $inline"; $inline =~ s/\s+<\/signpost>/<\/signpost>/; } - elsif( $inline eq "" ) { - } elsif( $inline =~ /^/ ) { warn( "Warning: unknown comment \"$1\" in \"$infile\"\n" ); } + elsif( $inline eq "" ) { + } else { $inline = "

    $inline

    "; $inline =~ s/\s+<\/p>/<\/p>/; -- 2.34.1