Changeset 2557


Ignore:
Timestamp:
03/01/10 00:22:13 (2 years ago)
Author:
karpet
Message:

add in refactored code from RegexpTermQuery

Location:
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/Compiler.pm

    r2556 r2557  
    55use Carp; 
    66use Search::Query::Dialect::KSx::Scorer; 
     7use Data::Dump qw( dump ); 
    78 
    89our $VERSION = '0.01'; 
     
    1011# inside out vars 
    1112my %include; 
     13my ( %idf, %raw_impact, %terms, %query_norm_factor, %normalized_impact, ); 
    1214 
    1315=head1 NAME 
     
    1719=head1 SYNOPSIS 
    1820 
    19  # see KinoSearch::Search::Compiler 
     21    # see KinoSearch::Search::Compiler 
    2022 
    2123=head1 METHODS 
    2224 
    23 This class isa KinoSearch::Search::Compiler subclass. 
    24 Only new or overridden methods are documented. 
     25This class isa KinoSearch::Search::Compiler subclass . Only new 
     26or overridden methods are documented . 
    2527 
    2628=cut 
     
    5759 
    5860    # Acquire a Lexicon and seek it to our query string. 
    59     my $substring = $self->get_parent->get_query_string; 
    60     $substring =~ s/\*.\s*$//; 
    61     my $field = $self->get_parent->get_field; 
     61    my $term    = $self->get_parent->get_term; 
     62    my $regex   = $self->get_parent->get_regex; 
     63    my $field   = $self->get_parent->get_field; 
     64    my $prefix  = $self->get_parent->get_prefix; 
    6265    my $lexicon = $lex_reader->lexicon( field => $field ); 
    6366    return unless $lexicon; 
    64     $lexicon->seek($substring); 
     67    $lexicon->seek( defined $prefix ? $prefix : '' ); 
    6568 
    6669    # Accumulate PostingLists for each matching term. 
    6770    my @posting_lists; 
    6871    my $include = $include{$$self}; 
    69     while ( defined( my $term = $lexicon->get_term ) ) { 
     72    while ( defined( my $lex_term = $lexicon->get_term ) ) { 
     73 
     74        # weed out non-matchers early. 
     75        last if defined $prefix and index( $lex_term, $prefix ) != 0; 
     76 
     77        #carp "$term field:$field: term>$lex_term<"; 
    7078        if ($include) { 
    71             last unless $term =~ m/^\Q$substring/; 
     79            next unless $lex_term =~ $regex; 
    7280        } 
    7381        else { 
    74             last if $term =~ m/^\Q$substring/; 
     82            last if $lex_term =~ $regex; 
    7583        } 
    7684        my $posting_list = $plist_reader->posting_list( 
    7785            field => $field, 
    78             term  => $term, 
     86            term  => $lex_term, 
    7987        ); 
     88 
     89        #carp "check posting_list"; 
    8090        if ($posting_list) { 
    8191            push @posting_lists, $posting_list; 
     
    8595    return unless @posting_lists; 
    8696 
     97    #carp dump \@posting_lists; 
     98 
    8799    return Search::Query::Dialect::KSx::Scorer->new( 
    88         posting_lists => \@posting_lists ); 
     100        posting_lists => \@posting_lists, 
     101        compiler      => $self, 
     102    ); 
    89103} 
     104 
     105# TODO decipher this 
     106#sub perform_query_normalization { 
     107# 
     108#    # copied from KinoSearch::Search::Weight originally 
     109#    my ( $self, $searcher ) = @_; 
     110#    my $sim = $self->get_similarity; 
     111# 
     112#    my $factor = $self->sum_of_squared_weights;    # factor = ( tf_q * idf_t ) 
     113#    $factor = $sim->query_norm($factor);           # factor /= norm_q 
     114#    $self->normalize($factor);                     # impact *= factor 
     115#} 
     116 
     117=head2 get_boost 
     118 
     119Returns the boost for the parent Query object. 
     120 
     121=cut 
     122 
     123sub get_boost { shift->get_parent->get_boost } 
     124 
     125# TODO decipher this 
     126#sub sum_of_squared_weights { my $self = shift; $raw_impact{$$self}**2 } 
     127 
     128# TODO decipher this 
     129#sub normalize {                                    # copied from TermQuery 
     130#    my ( $self, $query_norm_factor ) = @_; 
     131#    $query_norm_factor{$$self} = $query_norm_factor; 
     132# 
     133#    # Multiply raw impact by ( tf_q * idf_q / norm_q ) 
     134#    # 
     135#    # Note: factoring in IDF a second time is correct.  See formula. 
     136#    $normalized_impact{$$self} 
     137#        = $raw_impact{$$self} * $idf{$$self} * $query_norm_factor; 
     138#} 
    90139 
    911401; 
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/Scorer.pm

    r2556 r2557  
    88 
    99# Inside-out member vars. 
    10 my %doc_ids; 
    11 my %tally; 
    12 my %tick; 
     10my ( %doc_ids, %pos, %boosts, %sim, %term_freqs ); 
    1311 
    1412=head1 NAME 
     
    3533sub new { 
    3634    my ( $class, %args ) = @_; 
     35 
     36    my $compiler      = delete $args{compiler}; 
     37    my $reader        = delete $args{reader}; 
     38    my $need_score    = delete $args{need_score}; 
    3739    my $posting_lists = delete $args{posting_lists}; 
    3840    my $self          = $class->SUPER::new(%args); 
    3941 
    40     # Cheesy but simple way of interleaving PostingList doc sets. 
    41     my %all_doc_ids; 
     42    my %hits;    # The keys are the doc nums; the values the tfs. 
    4243    for my $posting_list (@$posting_lists) { 
    4344        while ( my $doc_id = $posting_list->next ) { 
    44             $all_doc_ids{$doc_id} = undef; 
     45            $hits{$doc_id} += $posting_list->get_doc_freq; 
     46 
     47            # TODO tf*weight ?? 
    4548        } 
    4649    } 
    47     my @doc_ids = sort { $a <=> $b } keys %all_doc_ids; 
    48     $doc_ids{$$self} = \@doc_ids; 
    4950 
    50     $tick{$$self}  = -1; 
    51     $tally{$$self} = KinoSearch::Search::Tally->new; 
    52     $tally{$$self}->set_score(1.0);    # fixed score of 1.0 
     51    $sim{$$self}        = $compiler->get_similarity; 
     52    $doc_ids{$$self}    = [ sort { $a <=> $b } keys %hits ]; 
     53    $term_freqs{$$self} = \%hits; 
     54 
     55    $pos{$$self}    = -1; 
     56    $boosts{$$self} = $compiler->get_boost; 
    5357 
    5458    return $self; 
    5559} 
    5660 
    57 sub DESTROY { 
    58     my $self = shift; 
    59     delete $doc_ids{$$self}; 
    60     delete $tick{$$self}; 
    61     delete $tally{$$self}; 
    62     $self->SUPER::DESTROY; 
    63 } 
    64  
    6561=head2 next 
    6662 
    67 Returns the next doc_id or 0. 
     63Returns the next doc_id. 
    6864 
    6965=cut 
     
    7268    my $self    = shift; 
    7369    my $doc_ids = $doc_ids{$$self}; 
    74     my $tick    = ++$tick{$$self}; 
    75     return 0 if $tick >= scalar @$doc_ids; 
    76     return $doc_ids->[$tick]; 
     70    return 0 if $pos{$$self} >= $#$doc_ids; 
     71    return $doc_ids->[ ++$pos{$$self} ]; 
    7772} 
    7873 
    7974=head2 get_doc_id 
    8075 
    81 Returns a doc_id. 
     76Returns the doc_id for the current position. 
    8277 
    8378=cut 
     
    8580sub get_doc_id { 
    8681    my $self    = shift; 
    87     my $tick    = $tick{$$self}; 
     82    my $pos     = $pos{$$self}; 
    8883    my $doc_ids = $doc_ids{$$self}; 
    89     return $tick < scalar @$doc_ids ? $doc_ids->[$tick] : 0; 
     84    return $pos < scalar @$doc_ids ? $$doc_ids[$pos] : 0; 
    9085} 
    9186 
    92 =head2 tally 
     87=head2 score 
    9388 
    94 Returns the tally for the Scorer (a KinoSearch::Search::Tally object). 
     89Returns the score of the hit. 
    9590 
    9691=cut 
    9792 
    98 sub tally { 
    99     my $self = shift; 
    100     return $tally{$$self}; 
     93sub score { 
     94    my $self      = shift; 
     95    my $pos       = $pos{$$self}; 
     96    my $doc_ids   = $doc_ids{$$self}; 
     97    my $boost     = $boosts{$$self}; 
     98    my $doc_id    = $$doc_ids[$pos]; 
     99    my $term_freq = $term_freqs{$$self}->{$doc_id}; 
     100    return $boost * $sim{$$self}->tf($term_freq); 
    101101} 
    102102 
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/WildcardQuery.pm

    r2556 r2557  
    3434=cut 
    3535 
    36 # Inside-out member vars and hand-rolled accessors. 
     36# Inside-out member vars 
    3737my %term; 
    3838my %field; 
     39my %regex; 
     40my %prefix; 
    3941 
    4042=head2 new( I<args> ) 
     
    5860    $term{$$self}  = $term; 
    5961    $field{$$self} = $field; 
     62    $self->_build_regex($term); 
    6063    return $self; 
    6164} 
    6265 
     66sub _build_regex { 
     67    my ( $self, $term ) = @_; 
     68    $term = quotemeta($term);  # turn into a regexp that matches a literal str 
     69    $term =~ s/\\\*/.*/g;          # convert wildcards into regex 
     70    $term =~ s/\\\?/./g;           # convert wildcards into regex 
     71    $term =~ s/(?:\.\*){2,}/.*/g;  # eliminate multiple consecutive wild cards 
     72    $term =~ s/^/^/ unless $term =~ s/^\.\*//;    # anchor the regexp to 
     73    $term =~ s/\z/\\z/ unless $term =~ s/\.\*\z//;    # the ends of the term 
     74    $regex{$$self} = qr/$term/; 
     75 
     76    # get the literal prefix of the regexp, if any. 
     77    if ($regex{$$self} =~ m<^ 
     78            (?:    # prefix for qr//'s, without allowing /i : 
     79                \(\? ([a-hj-z]*) (?:-[a-z]*)?: 
     80            )? 
     81            (\\[GA]|\^) # anchor 
     82            ([^#\$()*+.?[\]\\^]+) # literal pat (no metachars or comments) 
     83        >x 
     84        ) 
     85    { 
     86        { 
     87            my ( $mod, $anchor, $prefix ) = ( $1 || '', $2, $3 ); 
     88            $anchor eq '^' and $mod =~ /m/ and last; 
     89            for ($prefix) { 
     90                $mod =~ /x/ and s/\s+//g; 
     91            } 
     92            $prefix{$$self} = $prefix; 
     93        } 
     94    } 
     95 
     96} 
     97 
    6398=head2 get_term 
    6499 
     
    67102Retrieve the value set in new(). 
    68103 
    69 =cut 
    70  
    71 sub get_term  { my $self = shift; return $term{$$self} } 
    72 sub get_field { my $self = shift; return $field{$$self} } 
     104=head2 get_regex 
     105 
     106Retrieve the qr// object representing I<term>. 
     107 
     108=head2 get_prefix 
     109 
     110Retrieve the literal string (if any) that prefixes the wildcards 
     111in I<term>. 
     112 
     113=cut 
     114 
     115sub get_term   { my $self = shift; return $term{$$self} } 
     116sub get_field  { my $self = shift; return $field{$$self} } 
     117sub get_regex  { my $self = shift; return $regex{$$self} } 
     118sub get_prefix { my $self = shift; return $prefix{$$self} } 
    73119 
    74120sub DESTROY { 
Note: See TracChangeset for help on using the changeset viewer.