require 5; # The documentation is at the end. # Time-stamp: "2004-05-07 15:31:25 ADT" package Pod::Escapes; require Exporter; @ISA = ('Exporter'); $VERSION = '1.04'; @EXPORT_OK = qw( %Code2USASCII %Name2character %Name2character_number %Latin1Code_to_fallback %Latin1Char_to_fallback e2char e2charnum ); %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); #========================================================================== use strict; use vars qw( %Code2USASCII %Name2character %Name2character_number %Latin1Code_to_fallback %Latin1Char_to_fallback $FAR_CHAR $FAR_CHAR_NUMBER $NOT_ASCII ); $FAR_CHAR = "?" unless defined $FAR_CHAR; $FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER; $NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII; #-------------------------------------------------------------------------- sub e2char { my $in = $_[0]; return undef unless defined $in and length $in; # Convert to decimal: if($in =~ m/^(0[0-7]*)$/s ) { $in = oct $in; } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { $in = hex $1; } # else it's decimal, or named if($NOT_ASCII) { # We're in bizarro world of not-ASCII! # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR. unless($in =~ m/^\d+$/s) { # It's a named character reference. Get its numeric Unicode value. $in = $Name2character{$in}; return undef unless defined $in; # (if there's no such name) $in = ord $in; # (All ents must be one character long.) # ...So $in holds the char's US-ASCII numeric value, which we'll # now go get the local equivalent for. } # It's numeric, whether by origin or by mutation from a known name return $Code2USASCII{$in} # so "65" => "A" everywhere || $Latin1Code_to_fallback{$in} # Fallback. || $FAR_CHAR; # Fall further back } # Normal handling: if($in =~ m/^\d+$/s) { if($] < 5.007 and $in > 255) { # can't be trusted with Unicode return $FAR_CHAR; } else { return chr($in); } } else { return $Name2character{$in}; # returns undef if unknown } } #-------------------------------------------------------------------------- sub e2charnum { my $in = $_[0]; return undef unless defined $in and length $in; # Convert to decimal: if($in =~ m/^(0[0-7]*)$/s ) { $in = oct $in; } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) { $in = hex $1; } # else it's decimal, or named if($in =~ m/^\d+$/s) { return 0 + $in; } else { return $Name2character_number{$in}; # returns undef if unknown } } #-------------------------------------------------------------------------- %Name2character_number = ( # General XML/XHTML: 'lt' => 60, 'gt' => 62, 'quot' => 34, 'amp' => 38, 'apos' => 39, # POD-specific: 'sol' => 47, 'verbar' => 124, 'lchevron' => 171, # legacy for laquo 'rchevron' => 187, # legacy for raquo # Remember, grave looks like \ (as in virtu\) # acute looks like / (as in re/sume/) # circumflex looks like ^ (as in papier ma^che/) # umlaut/dieresis looks like " (as in nai"ve, Chloe") # From the XHTML 1 .ent files: 'nbsp' , 160, 'iexcl' , 161, 'cent' , 162, 'pound' , 163, 'curren' , 164, 'yen' , 165, 'brvbar' , 166, 'sect' , 167, 'uml' , 168, 'copy' , 169, 'ordf' , 170, 'laquo' , 171, 'not' , 172, 'shy' , 173, 'reg' , 174, 'macr' , 175, 'deg' , 176, 'plusmn' , 177, 'sup2' , 178, 'sup3' , 179, 'acute' , 180, 'micro' , 181, 'para' , 182, 'middot' , 183, 'cedil' , 184, 'sup1' , 185, 'ordm' , 186, 'raquo' , 187, 'frac14' , 188, 'frac12' , 189, 'frac34' , 190, 'iquest' , 191, 'Agrave' , 192, 'Aacute' , 193, 'Acirc' , 194, 'Atilde' , 195, 'Auml' , 196, 'Aring' , 197, 'AElig' , 198, 'Ccedil' , 199, 'Egrave' , 200, 'Eacute' , 201, 'Ecirc' , 202, 'Euml' , 203, 'Igrave' , 204, 'Iacute' , 205, 'Icirc' , 206, 'Iuml' , 207, 'ETH' , 208, 'Ntilde' , 209, 'Ograve' , 210, 'Oacute' , 211, 'Ocirc' , 212, 'Otilde' , 213, 'Ouml' , 214, 'times' , 215, 'Oslash' , 216, 'Ugrave' , 217, 'Uacute' , 218, 'Ucirc' , 219, 'Uuml' , 220, 'Yacute' , 221, 'THORN' , 222, 'szlig' , 223, 'agrave' , 224, 'aacute' , 225, 'acirc' , 226, 'atilde' , 227, 'auml' , 228, 'aring' , 229, 'aelig' , 230, 'ccedil' , 231, 'egrave' , 232, 'eacute' , 233, 'ecirc' , 234, 'euml' , 235, 'igrave' , 236, 'iacute' , 237, 'icirc' , 238, 'iuml' , 239, 'eth' , 240, 'ntilde' , 241, 'ograve' , 242, 'oacute' , 243, 'ocirc' , 244, 'otilde' , 245, 'ouml' , 246, 'divide' , 247, 'oslash' , 248, 'ugrave' , 249, 'uacute' , 250, 'ucirc' , 251, 'uuml' , 252, 'yacute' , 253, 'thorn' , 254, 'yuml' , 255, 'fnof' , 402, 'Alpha' , 913, 'Beta' , 914, 'Gamma' , 915, 'Delta' , 916, 'Epsilon' , 917, 'Zeta' , 918, 'Eta' , 919, 'Theta' , 920, 'Iota' , 921, 'Kappa' , 922, 'Lambda' , 923, 'Mu' , 924, 'Nu' , 925, 'Xi' , 926, 'Omicron' , 927, 'Pi' , 928, 'Rho' , 929, 'Sigma' , 931, 'Tau' , 932, 'Upsilon' , 933, 'Phi' , 934, 'Chi' , 935, 'Psi' , 936, 'Omega' , 937, 'alpha' , 945, 'beta' , 946, 'gamma' , 947, 'delta' , 948, 'epsilon' , 949, 'zeta' , 950, 'eta' , 951, 'theta' , 952, 'iota' , 953, 'kappa' , 954, 'lambda' , 955, 'mu' , 956, 'nu' , 957, 'xi' , 958, 'omicron' , 959, 'pi' , 960, 'rho' , 961, 'sigmaf' , 962, 'sigma' , 963, 'tau' , 964, 'upsilon' , 965, 'phi' , 966, 'chi' , 967, 'psi' , 968, 'omega' , 969, 'thetasym' , 977, 'upsih' , 978, 'piv' , 982, 'bull' , 8226, 'hellip' , 8230, 'prime' , 8242, 'Prime' , 8243, 'oline' , 8254, 'frasl' , 8260, 'weierp' , 8472, 'image' , 8465, 'real' , 8476, 'trade' , 8482, 'alefsym' , 8501, 'larr' , 8592, 'uarr' , 8593, 'rarr' , 8594, 'darr' , 8595, 'harr' , 8596, 'crarr' , 8629, 'lArr' , 8656, 'uArr' , 8657, 'rArr' , 8658, 'dArr' , 8659, 'hArr' , 8660, 'forall' , 8704, 'part' , 8706, 'exist' , 8707, 'empty' , 8709, 'nabla' , 8711, 'isin' , 8712, 'notin' , 8713, 'ni' , 8715, 'prod' , 8719, 'sum' , 8721, 'minus' , 8722, 'lowast' , 8727, 'radic' , 8730, 'prop' , 8733, 'infin' , 8734, 'ang' , 8736, 'and' , 8743, 'or' , 8744, 'cap' , 8745, 'cup' , 8746, 'int' , 8747, 'there4' , 8756, 'sim' , 8764, 'cong' , 8773, 'asymp' , 8776, 'ne' , 8800, 'equiv' , 8801, 'le' , 8804, 'ge' , 8805, 'sub' , 8834, 'sup' , 8835, 'nsub' , 8836, 'sube' , 8838, 'supe' , 8839, 'oplus' , 8853, 'otimes' , 8855, 'perp' , 8869, 'sdot' , 8901, 'lceil' , 8968, 'rceil' , 8969, 'lfloor' , 8970, 'rfloor' , 8971, 'lang' , 9001, 'rang' , 9002, 'loz' , 9674, 'spades' , 9824, 'clubs' , 9827, 'hearts' , 9829, 'diams' , 9830, 'OElig' , 338, 'oelig' , 339, 'Scaron' , 352, 'scaron' , 353, 'Yuml' , 376, 'circ' , 710, 'tilde' , 732, 'ensp' , 8194, 'emsp' , 8195, 'thinsp' , 8201, 'zwnj' , 8204, 'zwj' , 8205, 'lrm' , 8206, 'rlm' , 8207, 'ndash' , 8211, 'mdash' , 8212, 'lsquo' , 8216, 'rsquo' , 8217, 'sbquo' , 8218, 'ldquo' , 8220, 'rdquo' , 8221, 'bdquo' , 8222, 'dagger' , 8224, 'Dagger' , 8225, 'permil' , 8240, 'lsaquo' , 8249, 'rsaquo' , 8250, 'euro' , 8364, ); # Fill out %Name2character... { %Name2character = (); my($name, $number); while( ($name, $number) = each %Name2character_number) { if($] < 5.007 and $number > 255) { $Name2character{$name} = $FAR_CHAR; # substitute for Unicode characters, for perls # that can't reliable handle them } else { $Name2character{$name} = chr $number; # normal case } } # So they resolve 'right' even in EBCDIC-land $Name2character{'lt' } = '<'; $Name2character{'gt' } = '>'; $Name2character{'quot'} = '"'; $Name2character{'amp' } = '&'; $Name2character{'apos'} = "'"; $Name2character{'sol' } = '/'; $Name2character{'verbar'} = '|'; } #-------------------------------------------------------------------------- %Code2USASCII = ( # mostly generated by # perl -e "printf qq{ \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)" 32, ' ', 33, '!', 34, '"', 35, '#', 36, '$', 37, '%', 38, '&', 39, "'", #! 40, '(', 41, ')', 42, '*', 43, '+', 44, ',', 45, '-', 46, '.', 47, '/', 48, '0', 49, '1', 50, '2', 51, '3', 52, '4', 53, '5', 54, '6', 55, '7', 56, '8', 57, '9', 58, ':', 59, ';', 60, '<', 61, '=', 62, '>', 63, '?', 64, '@', 65, 'A', 66, 'B', 67, 'C', 68, 'D', 69, 'E', 70, 'F', 71, 'G', 72, 'H', 73, 'I', 74, 'J', 75, 'K', 76, 'L', 77, 'M', 78, 'N', 79, 'O', 80, 'P', 81, 'Q', 82, 'R', 83, 'S', 84, 'T', 85, 'U', 86, 'V', 87, 'W', 88, 'X', 89, 'Y', 90, 'Z', 91, '[', 92, "\\", #! 93, ']', 94, '^', 95, '_', 96, '`', 97, 'a', 98, 'b', 99, 'c', 100, 'd', 101, 'e', 102, 'f', 103, 'g', 104, 'h', 105, 'i', 106, 'j', 107, 'k', 108, 'l', 109, 'm', 110, 'n', 111, 'o', 112, 'p', 113, 'q', 114, 'r', 115, 's', 116, 't', 117, 'u', 118, 'v', 119, 'w', 120, 'x', 121, 'y', 122, 'z', 123, '{', 124, '|', 125, '}', 126, '~', ); #-------------------------------------------------------------------------- %Latin1Code_to_fallback = (); @Latin1Code_to_fallback{0xA0 .. 0xFF} = ( # Copied from Text/Unidecode/x00.pm: ' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-}, 'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?}, 'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss', 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y', ); { # Now stuff %Latin1Char_to_fallback: %Latin1Char_to_fallback = (); my($k,$v); while( ($k,$v) = each %Latin1Code_to_fallback) { $Latin1Char_to_fallback{chr $k} = $v; #print chr($k), ' => ', $v, "\n"; } } #-------------------------------------------------------------------------- 1; __END__ =head1 NAME Pod::Escapes -- for resolving Pod EE...E sequences =head1 SYNOPSIS use Pod::Escapes qw(e2char); ...la la la, parsing POD, la la la... $text = e2char($e_node->label); unless(defined $text) { print "Unknown E sequence \"", $e_node->label, "\"!"; } ...else print/interpolate $text... =head1 DESCRIPTION This module provides things that are useful in decoding Pod EE...E sequences. Presumably, it should be used only by Pod parsers and/or formatters. By default, Pod::Escapes exports none of its symbols. But you can request any of them to be exported. Either request them individually, as with C, or you can do C to get all exportable symbols. =head1 GOODIES =over =item e2char($e_content) Given a name or number that could appear in a Cname_or_numE> sequence, this returns the string that it stands for. For example, C, C, C, and C all return "/", because CsolE>, C47E>, C0x2fE>, and C057E>, all mean "/". If the name has no known value (as with a name of "qacute") or is syntactally invalid (as with a name of "1/4"), this returns undef. =item e2charnum($e_content) Given a name or number that could appear in a Cname_or_numE> sequence, this returns the number of the Unicode character that this stands for. For example, C, C, C, and C all return 47, because CsolE>, C47E>, C0x2fE>, and C057E>, all mean "/", whose Unicode number is 47. If the name has no known value (as with a name of "qacute") or is syntactally invalid (as with a name of "1/4"), this returns undef. =item $Name2character{I} Maps from names (as in CIE>) like "eacute" or "sol" to the string that each stands for. Note that this does not include numerics (like "64" or "x981c"). Under old Perl versions (before 5.7) you get a "?" in place of characters whose Unicode value is over 255. =item $Name2character_number{I} Maps from names (as in CIE>) like "eacute" or "sol" to the Unicode value that each stands for. For example, C<$Name2character_number{'eacute'}> is 201, and C<$Name2character_number{'eacute'}> is 8364. You get the correct Unicode value, regardless of the version of Perl you're using -- which differs from C<%Name2character>'s behavior under pre-5.7 Perls. Note that this hash does not include numerics (like "64" or "x981c"). =item $Latin1Code_to_fallback{I} For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps from the character code for a Latin-1 character (like 233 for lowercase e-acute) to the US-ASCII character that best aproximates it (like "e"). You may find this useful if you are rendering POD in a format that you think deals well only with US-ASCII characters. =item $Latin1Char_to_fallback{I} Just as above, but maps from characters (like "\xE9", lowercase e-acute) to characters (like "e"). =item $Code2USASCII{I} This maps from US-ASCII codes (like 32) to the corresponding character (like space, for 32). Only characters 32 to 126 are defined. This is meant for use by C when it senses that it's running on a non-ASCII platform (where chr(32) doesn't get you a space -- but $Code2USASCII{32} will). It's documented here just in case you might find it useful. =back =head1 CAVEATS On Perl versions before 5.7, Unicode characters with a value over 255 (like lambda or emdash) can't be conveyed. This module does work under such early Perl versions, but in the place of each such character, you get a "?". Latin-1 characters (characters 160-255) are unaffected. Under EBCDIC platforms, C may not always be the same as C, and ditto for C<$Name2character{$name}> and C. =head1 SEE ALSO L L L =head1 COPYRIGHT AND DISCLAIMERS Copyright (c) 2001-2004 Sean M. Burke. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. This program is distributed in the hope that it will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. Portions of the data tables in this module are derived from the entity declarations in the W3C XHTML specification. Currently (October 2001), that's these three: http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent =head1 AUTHOR Sean M. Burke C =cut #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # What I used for reading the XHTML .ent files: use strict; my(@norms, @good, @bad); my $dir = 'c:/sgml/docbook/'; my %escapes; foreach my $file (qw( xhtml-symbol.ent xhtml-lat1.ent xhtml-special.ent )) { open(IN, "<$dir$file") or die "can't read-open $dir$file: $!"; print "Reading $file...\n"; while() { if(m//) { my($name, $value) = ($1,$2); next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt'; $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s; print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s; if($value > 255) { push @good , sprintf " %-10s , chr(%s),\n", "'$name'", $value; push @bad , sprintf " %-10s , \$bad,\n", "'$name'", $value; } else { push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value; } } elsif(m/