Changeset 2586
- Timestamp:
- 03/07/10 00:21:39 (2 years ago)
- Location:
- Search-Query-Dialect-KSx/trunk
- Files:
-
- 7 edited
-
Changes (modified) (1 diff)
-
lib/Search/Query/Dialect/KSx.pm (modified) (1 diff)
-
lib/Search/Query/Dialect/KSx/Compiler.pm (modified) (8 diffs)
-
lib/Search/Query/Dialect/KSx/NOTWildcardQuery.pm (modified) (1 diff)
-
lib/Search/Query/Dialect/KSx/Scorer.pm (modified) (4 diffs)
-
lib/Search/Query/Dialect/KSx/WildcardQuery.pm (modified) (1 diff)
-
lib/Search/Query/Field/KSx.pm (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
Search-Query-Dialect-KSx/trunk/Changes
r2581 r2586 17 17 or Parser 18 18 19 0.05 xxx 20 * refactor scoring via Compiler and Scorer. -
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx.pm
r2581 r2586 17 17 use Search::Query::Dialect::KSx::WildcardQuery; 18 18 19 our $VERSION = '0.04'; 19 our $VERSION = '0.05'; 20 20 21 21 __PACKAGE__->mk_accessors( -
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/Compiler.pm
r2581 r2586 7 7 use Data::Dump qw( dump ); 8 8 9 our $VERSION = '0.0 4';9 our $VERSION = '0.05'; 10 10 11 11 # inside out vars 12 my %include; 13 my ( %idf, %raw_impact, %terms, %query_norm_factor, %normalized_impact, ); 12 my (%include, %searchable, %idf, 13 %raw_impact, %lex_terms, %doc_freq, 14 %query_norm_factor, %normalized_impact, %term_freq 15 ); 14 16 15 17 =head1 NAME … … 35 37 36 38 sub new { 37 my $class = shift; 38 my %args = @_; 39 my $include = delete $args{include} || 0; 40 my $self = $class->SUPER::new(%args); 41 $include{$$self} = $include; 39 my $class = shift; 40 my %args = @_; 41 my $include = delete $args{include} || 0; 42 my $searchable = $args{searchable} or croak "searchable required"; 43 my $self = $class->SUPER::new(%args); 44 $include{$$self} = $include; 45 $searchable{$$self} = $searchable; 42 46 return $self; 43 47 } … … 51 55 sub make_matcher { 52 56 my ( $self, %args ) = @_; 57 53 58 my $seg_reader = $args{reader}; 59 my $searchable = $searchable{$$self}; 54 60 55 61 # Retrieve low-level components LexiconReader and PostingListReader. … … 59 65 60 66 # Acquire a Lexicon and seek it to our query string. 61 my $term = $self->get_parent->get_term; 62 my $regex = $self->get_parent->get_regex; 63 my $field = $self->get_parent->get_field; 64 my $prefix = $self->get_parent->get_prefix; 67 my $parent = $self->get_parent; 68 my $term = $parent->get_term; 69 my $regex = $parent->get_regex; 70 my $field = $parent->get_field; 71 my $prefix = $parent->get_prefix; 65 72 my $lexicon = $lex_reader->lexicon( field => $field ); 66 73 return unless $lexicon; 74 75 # Retrieve the correct Similarity for the Query's field. 76 my $sim = $args{similarity} = $searchable->get_schema->fetch_sim($field); 77 67 78 $lexicon->seek( defined $prefix ? $prefix : '' ); 68 79 69 80 # Accumulate PostingLists for each matching term. 
70 81 my @posting_lists; 82 my @lex_terms; 71 83 my $include = $include{$$self}; 72 84 while ( defined( my $lex_term = $lexicon->get_term ) ) { … … 90 102 if ($posting_list) { 91 103 push @posting_lists, $posting_list; 104 push @lex_terms, $lex_term; 92 105 } 93 106 last unless $lexicon->next; … … 95 108 return unless @posting_lists; 96 109 110 $doc_freq{$$self} = scalar(@posting_lists); 111 $lex_terms{$$self} = \@lex_terms; 112 97 113 #carp dump \@posting_lists; 114 115 # Calculate and store the IDF 116 my $max_doc = $searchable->doc_max; 117 my $idf = $idf{$$self} 118 = $max_doc 119 ? $searchable->get_schema->fetch_type($field)->get_boost 120 + log( $max_doc / ( 1 + $doc_freq{$$self} ) ) 121 : $searchable->get_schema->fetch_type($field)->get_boost; 122 123 $raw_impact{$$self} = $idf * $parent->get_boost; 124 125 #carp "raw_impact{$$self}= $raw_impact{$$self}"; 126 127 # make final preparations 128 $self->_perform_query_normalization($searchable); 98 129 99 130 return Search::Query::Dialect::KSx::Scorer->new( … … 103 134 } 104 135 105 # TODO decipher this 106 #sub perform_query_normalization { 107 # 108 # # copied from KinoSearch::Search::Weight originally 109 # my ( $self, $searcher ) = @_; 110 # my $sim = $self->get_similarity; 111 # 112 # my $factor = $self->sum_of_squared_weights; # factor = ( tf_q * idf_t ) 113 # $factor = $sim->query_norm($factor); # factor /= norm_q 114 # $self->normalize($factor); # impact *= factor 115 #} 136 =head2 get_searchable 137 138 Returns the Searchable object for this Compiler. 139 140 =cut 141 142 sub get_searchable { 143 my $self = shift; 144 return $searchable{$$self}; 145 } 146 147 =head2 get_doc_freq 148 149 Returns the document frequency for this Compiler. 150 151 =cut 152 153 sub get_doc_freq { 154 my $self = shift; 155 return $doc_freq{$$self}; 156 } 157 158 =head2 get_lex_terms 159 160 Returns array ref of the terms in the lexicon that matched. 
161 162 =cut 163 164 sub get_lex_terms { 165 my $self = shift; 166 return $lex_terms{$$self}; 167 } 168 169 sub _perform_query_normalization { 170 171 # copied from KinoSearch::Search::Weight originally 172 my ( $self, $searcher ) = @_; 173 my $sim = $self->get_similarity; 174 my $factor = $self->sum_of_squared_weights; # factor = ( tf_q * idf_t ) 175 $factor = $sim->query_norm($factor); # factor /= norm_q 176 $self->normalize($factor); # impact *= factor 177 178 #carp "normalize factor=$factor"; 179 } 180 181 =head2 apply_norm_factor( I<factor> ) 182 183 Overrides base class. Currently just passes I<factor> on to parent method. 184 185 =cut 186 187 sub apply_norm_factor { 188 my ( $self, $factor ) = @_; 189 190 #carp "apply_norm_factor=$factor"; 191 192 $self->SUPER::apply_norm_factor($factor); 193 } 116 194 117 195 =head2 get_boost … … 123 201 sub get_boost { shift->get_parent->get_boost } 124 202 125 # TODO decipher this 126 #sub sum_of_squared_weights { my $self = shift; $raw_impact{$$self}**2 } 127 128 # TODO decipher this 129 #sub normalize { # copied from TermQuery 130 # my ( $self, $query_norm_factor ) = @_; 131 # $query_norm_factor{$$self} = $query_norm_factor; 132 # 133 # # Multiply raw impact by ( tf_q * idf_q / norm_q ) 134 # # 135 # # Note: factoring in IDF a second time is correct. See formula. 136 # $normalized_impact{$$self} 137 # = $raw_impact{$$self} * $idf{$$self} * $query_norm_factor; 138 #} 203 =head2 sum_of_squared_weights 204 205 Returns imact of term on score. 206 207 =cut 208 209 sub sum_of_squared_weights { 210 211 #carp "sum_of_squared_weights"; 212 my $self = shift; 213 return exists $raw_impact{$$self} ? $raw_impact{$$self}**2 : '1.0'; 214 } 215 216 =head2 normalize() 217 218 Affects the score of the term. See KinoSearch::Search::Compiler. 
219 220 =cut 221 222 sub normalize { # copied from TermQuery 223 my ( $self, $query_norm_factor ) = @_; 224 $query_norm_factor{$$self} = $query_norm_factor; 225 226 # Multiply raw impact by ( tf_q * idf_q / norm_q ) 227 # 228 # Note: factoring in IDF a second time is correct. See formula. 229 $normalized_impact{$$self} 230 = $raw_impact{$$self} * $idf{$$self} * $query_norm_factor; 231 232 #carp "normalized_impact{$$self} = $normalized_impact{$$self}"; 233 return $normalized_impact{$$self}; 234 } 139 235 140 236 1; -
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/NOTWildcardQuery.pm
r2581 r2586 5 5 use Carp; 6 6 7 our $VERSION = '0.04'; 7 our $VERSION = '0.05'; 8 8 9 9 =head1 NAME -
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/Scorer.pm
r2581 r2586 5 5 use Carp; 6 6 7 our $VERSION = '0.0 4';7 our $VERSION = '0.05'; 8 8 9 9 # Inside-out member vars. … … 43 43 for my $posting_list (@$posting_lists) { 44 44 while ( my $doc_id = $posting_list->next ) { 45 $hits{$doc_id} += $posting_list->get_doc_freq; 46 47 # TODO tf*weight ?? 45 my $posting = $posting_list->get_posting; 46 $hits{$doc_id} += $posting->get_freq; 48 47 } 49 48 } … … 52 51 $doc_ids{$$self} = [ sort { $a <=> $b } keys %hits ]; 53 52 $term_freqs{$$self} = \%hits; 54 55 $pos{$$self} = -1; 56 $boosts{$$self} = $compiler->get_boost; 53 $pos{$$self} = -1; 54 $boosts{$$self} = $compiler->get_boost; 57 55 58 56 return $self; … … 98 96 my $doc_id = $$doc_ids[$pos]; 99 97 my $term_freq = $term_freqs{$$self}->{$doc_id}; 100 return $boost * $sim{$$self}->tf($term_freq); 98 99 #carp "doc_id=$doc_id term_freq=$term_freq boost=$boost"; 100 return ( $boost * $sim{$$self}->tf($term_freq) ) / 10; 101 101 } 102 102 -
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/WildcardQuery.pm
r2581 r2586 7 7 use Search::Query::Dialect::KSx::Compiler; 8 8 9 our $VERSION = '0.04'; 9 our $VERSION = '0.05'; 10 10 11 11 =head1 NAME -
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Field/KSx.pm
r2581 r2586 7 7 __PACKAGE__->mk_accessors(qw( type is_int analyzer )); 8 8 9 our $VERSION = '0.04'; 9 our $VERSION = '0.05'; 10 10 11 11 =head1 NAME
Note: See TracChangeset
for help on using the changeset viewer.