Changeset 2414
- Timestamp:
- 01/22/10 10:24:43 (2 years ago)
- Location:
- Search-Tools/trunk
- Files:
-
- 7 edited
-
Changes (modified) (1 diff)
-
Makefile.PL (modified) (1 diff)
-
Tools.xs (modified) (1 diff)
-
lib/Search/Tools/Transliterate.pm (modified) (4 diffs)
-
lib/Search/Tools/UTF8.pm (modified) (3 diffs)
-
t/20transliterate-map.t (modified) (1 diff)
-
t/28-byte-length.t (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
Search-Tools/trunk/Changes
r2410 r2414 218 218 * add support for wildcard at start of term in addition to end of term. 219 219 * added looks_like_win1252() to UTF8 class and convert1252() to Transliterate. 220 * added Encoding::FixLatin as a dependency. 220 221 * fix off-by-one errors in find_bad_*_report and find_bad_* UTF8 functions. 221 * add debug_bytes _in_string() to UTF8 class.222 * add debug_bytes() to UTF8 class. -
Search-Tools/trunk/Makefile.PL
r2282 r2414 26 26 'File::Slurp' => 0, 27 27 'Test::More' => 0.94, 28 'Encoding::FixLatin' => 0, 28 29 29 30 #'Text::Aspell' => '0.06', # optional -
Search-Tools/trunk/Tools.xs
r2410 r2414 131 131 132 132 void 133 debug_bytes _in_string(string)133 debug_bytes(string) 134 134 SV* string; 135 135 -
Search-Tools/trunk/lib/Search/Tools/Transliterate.pm
r2412 r2414 6 6 use Carp; 7 7 use Encode; 8 use Encoding::FixLatin qw( fix_latin ); 8 9 use Data::Dump qw( dump ); 9 10 … … 85 86 B<0x80> and B<0x9f> inclusive. The 1252 codepoints are converted first to 86 87 their UTF-8 counterparts per http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT 87 and then I<text> is run through convert(). 88 using Encoding::FixLatin and then I<text> is run through convert(). 89 90 Note that I<text> is checked with the looks_like_win1252() function from 91 Search::Tools::UTF8 before calling fix_latin(). 88 92 89 93 =head1 BUGS … … 252 256 } 253 257 254 my %win1252 = (255 "\x80" => "\x{20AC}", #EURO SIGN256 257 #\x81 #UNDEFINED258 "\x82" => "\x{201A}", #SINGLE LOW-9 QUOTATION MARK259 "\x83" => "\x{0192}", #LATIN SMALL LETTER F WITH HOOK260 "\x84" => "\x{201E}", #DOUBLE LOW-9 QUOTATION MARK261 "\x85" => "\x{2026}", #HORIZONTAL ELLIPSIS262 "\x86" => "\x{2020}", #DAGGER263 "\x87" => "\x{2021}", #DOUBLE DAGGER264 "\x88" => "\x{02C6}", #MODIFIER LETTER CIRCUMFLEX ACCENT265 "\x89" => "\x{2030}", #PER MILLE SIGN266 "\x8A" => "\x{0160}", #LATIN CAPITAL LETTER S WITH CARON267 "\x8B" => "\x{2039}", #SINGLE LEFT-POINTING ANGLE QUOTATION MARK268 "\x8C" => "\x{0152}", #LATIN CAPITAL LIGATURE OE269 270 #\x8D #UNDEFINED271 "\x8E" => "\x{017D}", #LATIN CAPITAL LETTER Z WITH CARON272 273 #\x8F #UNDEFINED274 #\x90 #UNDEFINED275 "\x91" => "\x{2018}", #LEFT SINGLE QUOTATION MARK276 "\x92" => "\x{2019}", #RIGHT SINGLE QUOTATION MARK277 "\x93" => "\x{201C}", #LEFT DOUBLE QUOTATION MARK278 "\x94" => "\x{201D}", #RIGHT DOUBLE QUOTATION MARK279 "\x95" => "\x{2022}", #BULLET280 "\x96" => "\x{2013}", #EN DASH281 "\x97" => "\x{2014}", #EM DASH282 "\x98" => "\x{02DC}", #SMALL TILDE283 "\x99" => "\x{2122}", #TRADE MARK SIGN284 "\x9A" => "\x{0161}", #LATIN SMALL LETTER S WITH CARON285 "\x9B" => "\x{203A}", #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK286 "\x9C" => "\x{0153}", #LATIN SMALL LIGATURE OE287 288 #\x9D #UNDEFINED289 "\x9E" => "\x{017E}", #LATIN SMALL LETTER Z WITH CARON290 "\x9F" => "\x{0178}", #LATIN CAPITAL LETTER Y WITH DIAERESIS291 292 );293 294 258 sub convert1252 { 295 259 my ( $self, $buf ) = @_; 296 my $newbuf = '';297 260 298 261 # don't bother unless we have non-ascii bytes … … 300 263 301 264 $self->debug and warn "converting $buf\n"; 302 while ( $buf =~ m/(.)/gox ) { 303 my $char = $1; 304 $self->debug and warn "$char\n"; 305 if ( is_ascii($char) ) { 306 $self->debug and warn "$char is_ascii\n"; 307 $newbuf .= $char; 308 } 309 elsif ( exists $win1252{$char} ) { 310 $self->debug and warn "$char is win1252\n"; 311 $newbuf .= $win1252{$char}; 312 } 313 else { 314 $self->debug and warn "append $char\n"; 315 $newbuf .= $char; 316 } 317 318 } 319 265 my $newbuf = looks_like_win1252($buf) ? fix_latin($buf) : $buf; 320 266 return $self->convert($newbuf); 321 267 } -
Search-Tools/trunk/lib/Search/Tools/UTF8.pm
r2411 r2414 20 20 byte_length 21 21 looks_like_win1252 22 debug_bytes _in_string22 debug_bytes 23 23 ); 24 24 … … 134 134 if ( !is_latin1( $_[0] ) 135 135 && !is_ascii( $_[0] ) 136 && $_[0] =~ m/[\x 80-\x9f]/ )136 && $_[0] =~ m/[\x00-\x7f]?[\x80-\x9f][\x00-\x7f]?/ ) 137 137 { 138 138 return 1; … … 264 264 See also the Search::Tools::Transliterate convert1252() method. 265 265 266 =head2 debug_bytes _in_string( I<text> )266 =head2 debug_bytes( I<text> ) 267 267 268 268 Iterates over each byte in I<text>, printing byte, hex and decimal values -
Search-Tools/trunk/t/20transliterate-map.t
r2411 r2414 41 41 42 42 #$tr->debug(1); 43 my $win1252 = " \xcf\x80\x82\x83\x91\x92\x93\x94\x9f";44 my $utf8_not_1252 = to_utf8("\xcf"); # \xc3\x8f45 ok( looks_like_win1252($win1252), "looks_like_win1252" );46 ok( !looks_like_win1252($utf8_not_1252), "utf8 string !looks_like_win1252" );43 my $win1252 = "a\x{80}b\x{82}c\x{83}d\x{91}e\x{92}f\x{93}g"; 44 my $utf8_not_1252 = to_utf8("\xcf"); # \xc3\x8f 45 ok( looks_like_win1252($win1252), "looks_like_win1252" ); 46 ok( !looks_like_win1252($utf8_not_1252), "utf8 string !looks_like_win1252" ); 47 47 ok( my $win1252_conv = $tr->convert1252($win1252), "convert1252" ); 48 48 49 #debug_bytes_in_string($win1252); 50 #debug_bytes_in_string($utf8_not_1252); 49 #diag("win1252"); 50 #debug_bytes($win1252_conv); 51 52 #debug_bytes($utf8_not_1252); 51 53 52 54 #diag( dump $win1252_conv ); 53 is( $win1252_conv, qq{ IEUR'f''""Y}, "transliterate 1252" );55 is( $win1252_conv, qq{aEURb'cfd'e'f"g}, "transliterate 1252" ); 54 56 55 57 1; -
Search-Tools/trunk/t/28-byte-length.t
r2410 r2414 30 30 31 31 diag("astr: $astr"); 32 debug_bytes _in_string($astr);32 debug_bytes($astr); 33 33 34 34 diag("bstr: $bstr"); 35 debug_bytes _in_string($bstr);35 debug_bytes($bstr);
Note: See TracChangeset
for help on using the changeset viewer.