common/scripts/checkerrata.pl

   1 #!/usr/bin/perl -w
   2 # This script parses the errata list of a book in XML form and tries to
   3 # locate the added or replaced strings in the sections where they are now
   4 # supposed to exist. Things that are not found by the simple errata list
   5 # parsing code are just skipped. Nesting frontmatter sections are also
   6 # skipped.
   7
   8 # Old History:
   9 #  2005-08-23 Tried to improve handling of <link-text> tags
  10 #  2005-03-12 Treat &thinspace; correctly.
  11 #  2003-03-16 Fixed reporting of false positived due to <link-text> tags
  12 #  2003-03-15 First version.
  13
  14 $xmlfile = shift @ARGV;
  15 $xmlfile || die("usage: $0 <xml file>\n");
  16 $invalid = shift @ARGV;
  17 $invalid && die("usage: $0 <xml file>\n");
  18
  19 open(ERRATA, $xmlfile) || die("Cannot read input file $xmlfile\n");
  20 open(SECTIONS, $xmlfile) || die("Cannot read input file $xmlfile\n");
  21
  22 while(<ERRATA>) { # find errata list
  23   last if m/<section.*id="errerr".*>/;
  24 }
  25 die("No errata section!? Aborting.\n") if eof(ERRATA);
  26 while(<ERRATA>) { # find start of actual list
  27   last if m/<data>/;
  28 }
  29 die("No data in the errata section!? Aborting.\n") if eof(ERRATA);
  30
  31 print("No matches for these errata entries were found:\n");
  32
  33 # Now pick each paragraph (= errata) line
  34 ERRATALOOP: while(<ERRATA>) {
  35   next unless m/^\s*<p>\(<a\s+idref=".+">.+<\/a>\)/; # no paragraph = not an errata line
  36   last if m|</data>|; # end of the errata list
  37   ($sect) = m|\(<a (?:id="\w+?" )?idref="(\w+?)">.*</a>\)|;
  38   defined($sect) || die("Failed on line: $_");
  39   @reps = m|<quote>.*?</quote> with <quote>(.*?)</quote>|g;
  40   @adds = m|[Aa]dded <quote>(.*?(?=</quote>))</quote>|g;
  41   next unless @reps || @adds;
  42
  43   # Now find section and append all contents into one string
  44   while(<SECTIONS>) { # locate section
  45     last if m/<section.*id=\"$sect\">/;
  46   }
  47   die("Could not find section $sect!? (This might be because the errata\nentry for it is not placed in the correct section order.) Aborting.\n") if eof(SECTIONS);
  48   while(<SECTIONS>) { # locate data
  49     last if m/<data>/;
  50   }
  51   die("Could not find any data in $sect!? Aborting.\n") if eof(SECTIONS);
  52   $text = "";
  53   while(<SECTIONS>) { # grab all section contents
  54     last if m|</data>|;
  55     if (m/<section/) { # oh no, nested sections - we cannot handle this well
  56       seek(SECTIONS, -length(), 1); # give the next section a chance
  57       next ERRATALOOP; # give up on this section
  58     }
  59     # Before adding the text to the section blob, modify <link-text> tags
  60     # since they are commented out in the errata entries
  61     s|<(/?)link-text>|<!--${1}link-text-->|g;
  62     # The following line substitutes a colon for </enemy> tag since the
  63     # colon is not in the xml but is added during the conversion process.
  64     # Some enemies, notably in dotd.xml, were missing a colon after the
  65     # enemy declaration, which we have added in the PA editions.
  66     s|</enemy>|:|g;
  67     $text .= $_;
  68   }
  69   die("Could not find the end of $sect!? Aborting.\n") if eof(SECTIONS);
  70
  71   # The replacement may contain left/right quotes which in the sections are
  72   # <quote> thingies. Translate these. Also ignore &thinspace; first and
  73   # last in the replacements.
  74   # Refactor the duplicated code below some day!
  75   foreach $rep (@reps) {
  76     # Contemporary character entities
  77     $rep =~ s/<ch.l[sd]quot\/>/<quote>/g;
  78     $rep =~ s/<ch.r[sd]quot\/>/<\/quote>/g;
  79     $rep =~ s/^<ch.thinspace\/>//g;
  80     $rep =~ s/<ch.thinspace\/>$//g;
  81     # (Obsolete) character elements
  82     $rep =~ s/\&l[sd]quot;/<quote>/g;
  83     $rep =~ s/\&r[sd]quot;/<\/quote>/g;
  84     $rep =~ s/^\&thinspace;//g;
  85     $rep =~ s/\&thinspace;$//g;
  86
  87     if ($text !~ m/\Q$rep\E/) {
  88       print("Replacement \"$rep\" in $sect\n");
  89     }
  90   }
  91   foreach $add (@adds) {
  92     $add =~ s/<ch.l[sd]quot\/>/<quote>/g;
  93     $add =~ s/<ch.r[sd]quot\/>/<\/quote>/g;
  94     $add =~ s/^<ch.thinspace\/>//g;
  95     $add =~ s/<ch.thinspace\/>$//g;
  96     $add =~ s/\&l[sd]quot;/<quote>/g;
  97     $add =~ s/\&r[sd]quot;/<\/quote>/g;
  98     $add =~ s/^\&thinspace;//g;
  99     $add =~ s/\&thinspace;$//g;
 100
 101     if ($text !~ m/\Q$add\E/) {
 102       print("Addition    \"$add\" in $sect\n");
 103     }
 104   }
 105 }
 106 print("Checking finished!\n");