Changeset 2414


Ignore:
Timestamp:
01/22/10 10:24:43 (2 years ago)
Author:
karpet
Message:

use Encoding::FixLatin instead of our own code

Location:
Search-Tools/trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • Search-Tools/trunk/Changes

    r2410 r2414  
    218218    * add support for wildcard at start of term in addition to end of term. 
    219219    * added looks_like_win1252() to UTF8 class and convert1252() to Transliterate. 
     220    * added Encoding::FixLatin as a dependency. 
    220221    * fix off-by-one errors in find_bad_*_report and find_bad_* UTF8 functions. 
    221     * add debug_bytes_in_string() to UTF8 class. 
     222    * add debug_bytes() to UTF8 class. 
  • Search-Tools/trunk/Makefile.PL

    r2282 r2414  
    2626        'File::Slurp'         => 0, 
    2727        'Test::More'          => 0.94, 
     28        'Encoding::FixLatin'  => 0, 
    2829 
    2930        #'Text::Aspell'          => '0.06',  # optional 
  • Search-Tools/trunk/Tools.xs

    r2410 r2414  
    131131 
    132132void 
    133 debug_bytes_in_string(string) 
     133debug_bytes(string) 
    134134    SV* string; 
    135135 
  • Search-Tools/trunk/lib/Search/Tools/Transliterate.pm

    r2412 r2414  
    66use Carp; 
    77use Encode; 
     8use Encoding::FixLatin qw( fix_latin ); 
    89use Data::Dump qw( dump ); 
    910 
     
    8586B<0x80> and B<0x9f> inclusive. The 1252 codepoints are converted first to 
    8687their UTF-8 counterparts per http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT 
    87 and then I<text> is run through convert(). 
     88using Encoding::FixLatin and then I<text> is run through convert(). 
     89 
     90Note that I<text> is checked with the looks_like_win1252() function from 
     91Search::Tools::UTF8 before calling fix_latin(). 
    8892 
    8993=head1 BUGS 
     
    252256} 
    253257 
    254 my %win1252 = ( 
    255     "\x80" => "\x{20AC}",    #EURO SIGN 
    256  
    257     #\x81               #UNDEFINED 
    258     "\x82" => "\x{201A}",    #SINGLE LOW-9 QUOTATION MARK 
    259     "\x83" => "\x{0192}",    #LATIN SMALL LETTER F WITH HOOK 
    260     "\x84" => "\x{201E}",    #DOUBLE LOW-9 QUOTATION MARK 
    261     "\x85" => "\x{2026}",    #HORIZONTAL ELLIPSIS 
    262     "\x86" => "\x{2020}",    #DAGGER 
    263     "\x87" => "\x{2021}",    #DOUBLE DAGGER 
    264     "\x88" => "\x{02C6}",    #MODIFIER LETTER CIRCUMFLEX ACCENT 
    265     "\x89" => "\x{2030}",    #PER MILLE SIGN 
    266     "\x8A" => "\x{0160}",    #LATIN CAPITAL LETTER S WITH CARON 
    267     "\x8B" => "\x{2039}",    #SINGLE LEFT-POINTING ANGLE QUOTATION MARK 
    268     "\x8C" => "\x{0152}",    #LATIN CAPITAL LIGATURE OE 
    269  
    270     #\x8D               #UNDEFINED 
    271     "\x8E" => "\x{017D}",    #LATIN CAPITAL LETTER Z WITH CARON 
    272  
    273     #\x8F               #UNDEFINED 
    274     #\x90               #UNDEFINED 
    275     "\x91" => "\x{2018}",    #LEFT SINGLE QUOTATION MARK 
    276     "\x92" => "\x{2019}",    #RIGHT SINGLE QUOTATION MARK 
    277     "\x93" => "\x{201C}",    #LEFT DOUBLE QUOTATION MARK 
    278     "\x94" => "\x{201D}",    #RIGHT DOUBLE QUOTATION MARK 
    279     "\x95" => "\x{2022}",    #BULLET 
    280     "\x96" => "\x{2013}",    #EN DASH 
    281     "\x97" => "\x{2014}",    #EM DASH 
    282     "\x98" => "\x{02DC}",    #SMALL TILDE 
    283     "\x99" => "\x{2122}",    #TRADE MARK SIGN 
    284     "\x9A" => "\x{0161}",    #LATIN SMALL LETTER S WITH CARON 
    285     "\x9B" => "\x{203A}",    #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 
    286     "\x9C" => "\x{0153}",    #LATIN SMALL LIGATURE OE 
    287  
    288     #\x9D               #UNDEFINED 
    289     "\x9E" => "\x{017E}",    #LATIN SMALL LETTER Z WITH CARON 
    290     "\x9F" => "\x{0178}",    #LATIN CAPITAL LETTER Y WITH DIAERESIS 
    291  
    292 ); 
    293  
    294258sub convert1252 { 
    295259    my ( $self, $buf ) = @_; 
    296     my $newbuf = ''; 
    297260 
    298261    # don't bother unless we have non-ascii bytes 
     
    300263 
    301264    $self->debug and warn "converting $buf\n"; 
    302     while ( $buf =~ m/(.)/gox ) { 
    303         my $char = $1; 
    304         $self->debug and warn "$char\n"; 
    305         if ( is_ascii($char) ) { 
    306             $self->debug and warn "$char is_ascii\n"; 
    307             $newbuf .= $char; 
    308         } 
    309         elsif ( exists $win1252{$char} ) { 
    310             $self->debug and warn "$char is win1252\n"; 
    311             $newbuf .= $win1252{$char}; 
    312         } 
    313         else { 
    314             $self->debug and warn "append $char\n"; 
    315             $newbuf .= $char; 
    316         } 
    317  
    318     } 
    319  
     265    my $newbuf = looks_like_win1252($buf) ? fix_latin($buf) : $buf; 
    320266    return $self->convert($newbuf); 
    321267} 
  • Search-Tools/trunk/lib/Search/Tools/UTF8.pm

    r2411 r2414  
    2020    byte_length 
    2121    looks_like_win1252 
    22     debug_bytes_in_string 
     22    debug_bytes 
    2323); 
    2424 
     
    134134    if (   !is_latin1( $_[0] ) 
    135135        && !is_ascii( $_[0] ) 
    136         && $_[0] =~ m/[\x80-\x9f]/ ) 
     136        && $_[0] =~ m/[\x00-\x7f]?[\x80-\x9f][\x00-\x7f]?/ ) 
    137137    { 
    138138        return 1; 
     
    264264See also the Search::Tools::Transliterate convert1252() method. 
    265265 
    266 =head2 debug_bytes_in_string( I<text> ) 
     266=head2 debug_bytes( I<text> ) 
    267267 
    268268Iterates over each byte in I<text>, printing byte, hex and decimal values 
  • Search-Tools/trunk/t/20transliterate-map.t

    r2411 r2414  
    4141 
    4242#$tr->debug(1); 
    43 my $win1252 = "\xcf\x80\x82\x83\x91\x92\x93\x94\x9f"; 
    44 my $utf8_not_1252 = to_utf8("\xcf");  # \xc3\x8f 
    45 ok( looks_like_win1252($win1252), "looks_like_win1252" ); 
    46 ok( !looks_like_win1252($utf8_not_1252), "utf8 string !looks_like_win1252"); 
     43my $win1252 = "a\x{80}b\x{82}c\x{83}d\x{91}e\x{92}f\x{93}g"; 
     44my $utf8_not_1252 = to_utf8("\xcf");    # \xc3\x8f 
     45ok( looks_like_win1252($win1252),        "looks_like_win1252" ); 
     46ok( !looks_like_win1252($utf8_not_1252), "utf8 string !looks_like_win1252" ); 
    4747ok( my $win1252_conv = $tr->convert1252($win1252), "convert1252" ); 
    4848 
    49 #debug_bytes_in_string($win1252); 
    50 #debug_bytes_in_string($utf8_not_1252); 
     49#diag("win1252"); 
     50#debug_bytes($win1252_conv); 
     51 
     52#debug_bytes($utf8_not_1252); 
    5153 
    5254#diag( dump $win1252_conv ); 
    53 is( $win1252_conv, qq{IEUR'f''""Y}, "transliterate 1252" ); 
     55is( $win1252_conv, qq{aEURb'cfd'e'f"g}, "transliterate 1252" ); 
    5456 
    55571; 
  • Search-Tools/trunk/t/28-byte-length.t

    r2410 r2414  
    3030 
    3131diag("astr: $astr"); 
    32 debug_bytes_in_string($astr); 
     32debug_bytes($astr); 
    3333 
    3434diag("bstr: $bstr"); 
    35 debug_bytes_in_string($bstr); 
     35debug_bytes($bstr); 
Note: See TracChangeset for help on using the changeset viewer.