Changeset 2586


Ignore:
Timestamp:
03/07/10 00:21:39 (2 years ago)
Author:
karpet
Message:

refactor scoring

Location:
Search-Query-Dialect-KSx/trunk
Files:
7 edited

Legend:

Unmodified
Added
Removed
  • Search-Query-Dialect-KSx/trunk/Changes

    r2581 r2586  
    1717          or Parser 
    1818 
     190.05    xxx 
     20        * refactor scoring via Compiler and Scorer. 
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx.pm

    r2581 r2586  
    1717use Search::Query::Dialect::KSx::WildcardQuery; 
    1818 
    19 our $VERSION = '0.04'; 
     19our $VERSION = '0.05'; 
    2020 
    2121__PACKAGE__->mk_accessors( 
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/Compiler.pm

    r2581 r2586  
    77use Data::Dump qw( dump ); 
    88 
    9 our $VERSION = '0.04'; 
     9our $VERSION = '0.05'; 
    1010 
    1111# inside out vars 
    12 my %include; 
    13 my ( %idf, %raw_impact, %terms, %query_norm_factor, %normalized_impact, ); 
     12my (%include,           %searchable,        %idf, 
     13    %raw_impact,        %lex_terms,         %doc_freq, 
     14    %query_norm_factor, %normalized_impact, %term_freq 
     15); 
    1416 
    1517=head1 NAME 
     
    3537 
    3638sub new { 
    37     my $class   = shift; 
    38     my %args    = @_; 
    39     my $include = delete $args{include} || 0; 
    40     my $self    = $class->SUPER::new(%args); 
    41     $include{$$self} = $include; 
     39    my $class      = shift; 
     40    my %args       = @_; 
     41    my $include    = delete $args{include} || 0; 
     42    my $searchable = $args{searchable} or croak "searchable required"; 
     43    my $self       = $class->SUPER::new(%args); 
     44    $include{$$self}    = $include; 
     45    $searchable{$$self} = $searchable; 
    4246    return $self; 
    4347} 
     
    5155sub make_matcher { 
    5256    my ( $self, %args ) = @_; 
     57 
    5358    my $seg_reader = $args{reader}; 
     59    my $searchable = $searchable{$$self}; 
    5460 
    5561    # Retrieve low-level components LexiconReader and PostingListReader. 
     
    5965 
    6066    # Acquire a Lexicon and seek it to our query string. 
    61     my $term    = $self->get_parent->get_term; 
    62     my $regex   = $self->get_parent->get_regex; 
    63     my $field   = $self->get_parent->get_field; 
    64     my $prefix  = $self->get_parent->get_prefix; 
     67    my $parent  = $self->get_parent; 
     68    my $term    = $parent->get_term; 
     69    my $regex   = $parent->get_regex; 
     70    my $field   = $parent->get_field; 
     71    my $prefix  = $parent->get_prefix; 
    6572    my $lexicon = $lex_reader->lexicon( field => $field ); 
    6673    return unless $lexicon; 
     74 
     75    # Retrieve the correct Similarity for the Query's field. 
     76    my $sim = $args{similarity} = $searchable->get_schema->fetch_sim($field); 
     77 
    6778    $lexicon->seek( defined $prefix ? $prefix : '' ); 
    6879 
    6980    # Accumulate PostingLists for each matching term. 
    7081    my @posting_lists; 
     82    my @lex_terms; 
    7183    my $include = $include{$$self}; 
    7284    while ( defined( my $lex_term = $lexicon->get_term ) ) { 
     
    90102        if ($posting_list) { 
    91103            push @posting_lists, $posting_list; 
     104            push @lex_terms,     $lex_term; 
    92105        } 
    93106        last unless $lexicon->next; 
     
    95108    return unless @posting_lists; 
    96109 
     110    $doc_freq{$$self}  = scalar(@posting_lists); 
     111    $lex_terms{$$self} = \@lex_terms; 
     112 
    97113    #carp dump \@posting_lists; 
     114 
     115    # Calculate and store the IDF 
     116    my $max_doc = $searchable->doc_max; 
     117    my $idf     = $idf{$$self} 
     118        = $max_doc 
     119        ? $searchable->get_schema->fetch_type($field)->get_boost 
     120        + log( $max_doc / ( 1 + $doc_freq{$$self} ) ) 
     121        : $searchable->get_schema->fetch_type($field)->get_boost; 
     122 
     123    $raw_impact{$$self} = $idf * $parent->get_boost; 
     124 
     125    #carp "raw_impact{$$self}= $raw_impact{$$self}"; 
     126 
     127    # make final preparations 
     128    $self->_perform_query_normalization($searchable); 
    98129 
    99130    return Search::Query::Dialect::KSx::Scorer->new( 
     
    103134} 
    104135 
    105 # TODO decipher this 
    106 #sub perform_query_normalization { 
    107 # 
    108 #    # copied from KinoSearch::Search::Weight originally 
    109 #    my ( $self, $searcher ) = @_; 
    110 #    my $sim = $self->get_similarity; 
    111 # 
    112 #    my $factor = $self->sum_of_squared_weights;    # factor = ( tf_q * idf_t ) 
    113 #    $factor = $sim->query_norm($factor);           # factor /= norm_q 
    114 #    $self->normalize($factor);                     # impact *= factor 
    115 #} 
     136=head2 get_searchable 
     137 
     138Returns the Searchable object for this Compiler. 
     139 
     140=cut 
     141 
     142sub get_searchable { 
     143    my $self = shift; 
     144    return $searchable{$$self}; 
     145} 
     146 
     147=head2 get_doc_freq 
     148 
     149Returns the document frequency for this Compiler. 
     150 
     151=cut 
     152 
     153sub get_doc_freq { 
     154    my $self = shift; 
     155    return $doc_freq{$$self}; 
     156} 
     157 
     158=head2 get_lex_terms 
     159 
     160Returns array ref of the terms in the lexicon that matched. 
     161 
     162=cut 
     163 
     164sub get_lex_terms { 
     165    my $self = shift; 
     166    return $lex_terms{$$self}; 
     167} 
     168 
     169sub _perform_query_normalization { 
     170 
     171    # copied from KinoSearch::Search::Weight originally 
     172    my ( $self, $searcher ) = @_; 
     173    my $sim    = $self->get_similarity; 
     174    my $factor = $self->sum_of_squared_weights;    # factor = ( tf_q * idf_t ) 
     175    $factor = $sim->query_norm($factor);           # factor /= norm_q 
     176    $self->normalize($factor);                     # impact *= factor 
     177 
     178    #carp "normalize factor=$factor"; 
     179} 
     180 
     181=head2 apply_norm_factor( I<factor> ) 
     182 
     183Overrides base class. Currently just passes I<factor> on to parent method. 
     184 
     185=cut 
     186 
     187sub apply_norm_factor { 
     188    my ( $self, $factor ) = @_; 
     189 
     190    #carp "apply_norm_factor=$factor"; 
     191 
     192    $self->SUPER::apply_norm_factor($factor); 
     193} 
    116194 
    117195=head2 get_boost 
     
    123201sub get_boost { shift->get_parent->get_boost } 
    124202 
    125 # TODO decipher this 
    126 #sub sum_of_squared_weights { my $self = shift; $raw_impact{$$self}**2 } 
    127  
    128 # TODO decipher this 
    129 #sub normalize {                                    # copied from TermQuery 
    130 #    my ( $self, $query_norm_factor ) = @_; 
    131 #    $query_norm_factor{$$self} = $query_norm_factor; 
    132 # 
    133 #    # Multiply raw impact by ( tf_q * idf_q / norm_q ) 
    134 #    # 
    135 #    # Note: factoring in IDF a second time is correct.  See formula. 
    136 #    $normalized_impact{$$self} 
    137 #        = $raw_impact{$$self} * $idf{$$self} * $query_norm_factor; 
    138 #} 
     203=head2 sum_of_squared_weights 
     204 
     205Returns impact of term on score. 
     206 
     207=cut 
     208 
     209sub sum_of_squared_weights { 
     210 
     211    #carp "sum_of_squared_weights"; 
     212    my $self = shift; 
     213    return exists $raw_impact{$$self} ? $raw_impact{$$self}**2 : '1.0'; 
     214} 
     215 
     216=head2 normalize() 
     217 
     218Affects the score of the term. See KinoSearch::Search::Compiler. 
     219 
     220=cut 
     221 
     222sub normalize {    # copied from TermQuery 
     223    my ( $self, $query_norm_factor ) = @_; 
     224    $query_norm_factor{$$self} = $query_norm_factor; 
     225 
     226    # Multiply raw impact by ( tf_q * idf_q / norm_q ) 
     227    # 
     228    # Note: factoring in IDF a second time is correct.  See formula. 
     229    $normalized_impact{$$self} 
     230        = $raw_impact{$$self} * $idf{$$self} * $query_norm_factor; 
     231 
     232    #carp "normalized_impact{$$self} = $normalized_impact{$$self}"; 
     233    return $normalized_impact{$$self}; 
     234} 
    139235 
    1402361; 
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/NOTWildcardQuery.pm

    r2581 r2586  
    55use Carp; 
    66 
    7 our $VERSION = '0.04'; 
     7our $VERSION = '0.05'; 
    88 
    99=head1 NAME 
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/Scorer.pm

    r2581 r2586  
    55use Carp; 
    66 
    7 our $VERSION = '0.04'; 
     7our $VERSION = '0.05'; 
    88 
    99# Inside-out member vars. 
     
    4343    for my $posting_list (@$posting_lists) { 
    4444        while ( my $doc_id = $posting_list->next ) { 
    45             $hits{$doc_id} += $posting_list->get_doc_freq; 
    46  
    47             # TODO tf*weight ?? 
     45            my $posting = $posting_list->get_posting; 
     46            $hits{$doc_id} += $posting->get_freq; 
    4847        } 
    4948    } 
     
    5251    $doc_ids{$$self}    = [ sort { $a <=> $b } keys %hits ]; 
    5352    $term_freqs{$$self} = \%hits; 
    54  
    55     $pos{$$self}    = -1; 
    56     $boosts{$$self} = $compiler->get_boost; 
     53    $pos{$$self}        = -1; 
     54    $boosts{$$self}     = $compiler->get_boost; 
    5755 
    5856    return $self; 
     
    9896    my $doc_id    = $$doc_ids[$pos]; 
    9997    my $term_freq = $term_freqs{$$self}->{$doc_id}; 
    100     return $boost * $sim{$$self}->tf($term_freq); 
     98 
     99    #carp "doc_id=$doc_id  term_freq=$term_freq  boost=$boost"; 
     100    return ( $boost * $sim{$$self}->tf($term_freq) ) / 10; 
    101101} 
    102102 
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/WildcardQuery.pm

    r2581 r2586  
    77use Search::Query::Dialect::KSx::Compiler; 
    88 
    9 our $VERSION = '0.04'; 
     9our $VERSION = '0.05'; 
    1010 
    1111=head1 NAME 
  • Search-Query-Dialect-KSx/trunk/lib/Search/Query/Field/KSx.pm

    r2581 r2586  
    77__PACKAGE__->mk_accessors(qw( type is_int analyzer )); 
    88 
    9 our $VERSION = '0.04'; 
     9our $VERSION = '0.05'; 
    1010 
    1111=head1 NAME 
Note: See TracChangeset for help on using the changeset viewer.