Changeset 2557
- Timestamp:
- 03/01/10 00:22:13 (2 years ago)
- Location:
- Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx
- Files:
-
- 3 edited
-
Compiler.pm (modified) (5 diffs)
-
Scorer.pm (modified) (4 diffs)
-
WildcardQuery.pm (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
-
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/Compiler.pm
r2556 r2557 5 5 use Carp; 6 6 use Search::Query::Dialect::KSx::Scorer; 7 use Data::Dump qw( dump ); 7 8 8 9 our $VERSION = '0.01'; … … 10 11 # inside out vars 11 12 my %include; 13 my ( %idf, %raw_impact, %terms, %query_norm_factor, %normalized_impact, ); 12 14 13 15 =head1 NAME … … 17 19 =head1 SYNOPSIS 18 20 19 # see KinoSearch::Search::Compiler21 # see KinoSearch::Search::Compiler 20 22 21 23 =head1 METHODS 22 24 23 This class isa KinoSearch::Search::Compiler subclass .24 Only new or overridden methods are documented.25 This class isa KinoSearch::Search::Compiler subclass . Only new 26 or overridden methods are documented . 25 27 26 28 =cut … … 57 59 58 60 # Acquire a Lexicon and seek it to our query string. 59 my $substring = $self->get_parent->get_query_string; 60 $substring =~ s/\*.\s*$//; 61 my $field = $self->get_parent->get_field; 61 my $term = $self->get_parent->get_term; 62 my $regex = $self->get_parent->get_regex; 63 my $field = $self->get_parent->get_field; 64 my $prefix = $self->get_parent->get_prefix; 62 65 my $lexicon = $lex_reader->lexicon( field => $field ); 63 66 return unless $lexicon; 64 $lexicon->seek( $substring);67 $lexicon->seek( defined $prefix ? $prefix : '' ); 65 68 66 69 # Accumulate PostingLists for each matching term. 67 70 my @posting_lists; 68 71 my $include = $include{$$self}; 69 while ( defined( my $term = $lexicon->get_term ) ) { 72 while ( defined( my $lex_term = $lexicon->get_term ) ) { 73 74 # weed out non-matchers early. 75 last if defined $prefix and index( $lex_term, $prefix ) != 0; 76 77 #carp "$term field:$field: term>$lex_term<"; 70 78 if ($include) { 71 last unless $term =~ m/^\Q$substring/;79 next unless $lex_term =~ $regex; 72 80 } 73 81 else { 74 last if $ term =~ m/^\Q$substring/;82 last if $lex_term =~ $regex; 75 83 } 76 84 my $posting_list = $plist_reader->posting_list( 77 85 field => $field, 78 term => $ term,86 term => $lex_term, 79 87 ); 88 89 #carp "check posting_list"; 80 90 if ($posting_list) { 81 91 push @posting_lists, $posting_list; … … 85 95 return unless @posting_lists; 86 96 97 #carp dump \@posting_lists; 98 87 99 return Search::Query::Dialect::KSx::Scorer->new( 88 posting_lists => \@posting_lists ); 100 posting_lists => \@posting_lists, 101 compiler => $self, 102 ); 89 103 } 104 105 # TODO decipher this 106 #sub perform_query_normalization { 107 # 108 # # copied from KinoSearch::Search::Weight originally 109 # my ( $self, $searcher ) = @_; 110 # my $sim = $self->get_similarity; 111 # 112 # my $factor = $self->sum_of_squared_weights; # factor = ( tf_q * idf_t ) 113 # $factor = $sim->query_norm($factor); # factor /= norm_q 114 # $self->normalize($factor); # impact *= factor 115 #} 116 117 =head2 get_boost 118 119 Returns the boost for the parent Query object. 120 121 =cut 122 123 sub get_boost { shift->get_parent->get_boost } 124 125 # TODO decipher this 126 #sub sum_of_squared_weights { my $self = shift; $raw_impact{$$self}**2 } 127 128 # TODO decipher this 129 #sub normalize { # copied from TermQuery 130 # my ( $self, $query_norm_factor ) = @_; 131 # $query_norm_factor{$$self} = $query_norm_factor; 132 # 133 # # Multiply raw impact by ( tf_q * idf_q / norm_q ) 134 # # 135 # # Note: factoring in IDF a second time is correct. See formula. 136 # $normalized_impact{$$self} 137 # = $raw_impact{$$self} * $idf{$$self} * $query_norm_factor; 138 #} 90 139 91 140 1; -
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/Scorer.pm
r2556 r2557 8 8 9 9 # Inside-out member vars. 10 my %doc_ids; 11 my %tally; 12 my %tick; 10 my ( %doc_ids, %pos, %boosts, %sim, %term_freqs ); 13 11 14 12 =head1 NAME … … 35 33 sub new { 36 34 my ( $class, %args ) = @_; 35 36 my $compiler = delete $args{compiler}; 37 my $reader = delete $args{reader}; 38 my $need_score = delete $args{need_score}; 37 39 my $posting_lists = delete $args{posting_lists}; 38 40 my $self = $class->SUPER::new(%args); 39 41 40 # Cheesy but simple way of interleaving PostingList doc sets. 41 my %all_doc_ids; 42 my %hits; # The keys are the doc nums; the values the tfs. 42 43 for my $posting_list (@$posting_lists) { 43 44 while ( my $doc_id = $posting_list->next ) { 44 $all_doc_ids{$doc_id} = undef; 45 $hits{$doc_id} += $posting_list->get_doc_freq; 46 47 # TODO tf*weight ?? 45 48 } 46 49 } 47 my @doc_ids = sort { $a <=> $b } keys %all_doc_ids;48 $doc_ids{$$self} = \@doc_ids;49 50 50 $tick{$$self} = -1; 51 $tally{$$self} = KinoSearch::Search::Tally->new; 52 $tally{$$self}->set_score(1.0); # fixed score of 1.0 51 $sim{$$self} = $compiler->get_similarity; 52 $doc_ids{$$self} = [ sort { $a <=> $b } keys %hits ]; 53 $term_freqs{$$self} = \%hits; 54 55 $pos{$$self} = -1; 56 $boosts{$$self} = $compiler->get_boost; 53 57 54 58 return $self; 55 59 } 56 60 57 sub DESTROY {58 my $self = shift;59 delete $doc_ids{$$self};60 delete $tick{$$self};61 delete $tally{$$self};62 $self->SUPER::DESTROY;63 }64 65 61 =head2 next 66 62 67 Returns the next doc_id or 0.63 Returns the next doc_id. 68 64 69 65 =cut … … 72 68 my $self = shift; 73 69 my $doc_ids = $doc_ids{$$self}; 74 my $tick = ++$tick{$$self}; 75 return 0 if $tick >= scalar @$doc_ids; 76 return $doc_ids->[$tick]; 70 return 0 if $pos{$$self} >= $#$doc_ids; 71 return $doc_ids->[ ++$pos{$$self} ]; 77 72 } 78 73 79 74 =head2 get_doc_id 80 75 81 Returns a doc_id.76 Returns the doc_id for the current position. 82 77 83 78 =cut … … 85 80 sub get_doc_id { 86 81 my $self = shift; 87 my $ tick = $tick{$$self};82 my $pos = $pos{$$self}; 88 83 my $doc_ids = $doc_ids{$$self}; 89 return $ tick < scalar @$doc_ids ? $doc_ids->[$tick] : 0;84 return $pos < scalar @$doc_ids ? $$doc_ids[$pos] : 0; 90 85 } 91 86 92 =head2 tally87 =head2 score 93 88 94 Returns the tally for the Scorer (a KinoSearch::Search::Tally object).89 Returns the score of the hit. 95 90 96 91 =cut 97 92 98 sub tally { 99 my $self = shift; 100 return $tally{$$self}; 93 sub score { 94 my $self = shift; 95 my $pos = $pos{$$self}; 96 my $doc_ids = $doc_ids{$$self}; 97 my $boost = $boosts{$$self}; 98 my $doc_id = $$doc_ids[$pos]; 99 my $term_freq = $term_freqs{$$self}->{$doc_id}; 100 return $boost * $sim{$$self}->tf($term_freq); 101 101 } 102 102 -
Search-Query-Dialect-KSx/trunk/lib/Search/Query/Dialect/KSx/WildcardQuery.pm
r2556 r2557 34 34 =cut 35 35 36 # Inside-out member vars and hand-rolled accessors.36 # Inside-out member vars 37 37 my %term; 38 38 my %field; 39 my %regex; 40 my %prefix; 39 41 40 42 =head2 new( I<args> ) … … 58 60 $term{$$self} = $term; 59 61 $field{$$self} = $field; 62 $self->_build_regex($term); 60 63 return $self; 61 64 } 62 65 66 sub _build_regex { 67 my ( $self, $term ) = @_; 68 $term = quotemeta($term); # turn into a regexp that matches a literal str 69 $term =~ s/\\\*/.*/g; # convert wildcards into regex 70 $term =~ s/\\\?/./g; # convert wildcards into regex 71 $term =~ s/(?:\.\*){2,}/.*/g; # eliminate multiple consecutive wild cards 72 $term =~ s/^/^/ unless $term =~ s/^\.\*//; # anchor the regexp to 73 $term =~ s/\z/\\z/ unless $term =~ s/\.\*\z//; # the ends of the term 74 $regex{$$self} = qr/$term/; 75 76 # get the literal prefix of the regexp, if any. 77 if ($regex{$$self} =~ m<^ 78 (?: # prefix for qr//'s, without allowing /i : 79 \(\? ([a-hj-z]*) (?:-[a-z]*)?: 80 )? 81 (\\[GA]|\^) # anchor 82 ([^#\$()*+.?[\]\\^]+) # literal pat (no metachars or comments) 83 >x 84 ) 85 { 86 { 87 my ( $mod, $anchor, $prefix ) = ( $1 || '', $2, $3 ); 88 $anchor eq '^' and $mod =~ /m/ and last; 89 for ($prefix) { 90 $mod =~ /x/ and s/\s+//g; 91 } 92 $prefix{$$self} = $prefix; 93 } 94 } 95 96 } 97 63 98 =head2 get_term 64 99 … … 67 102 Retrieve the value set in new(). 68 103 69 =cut 70 71 sub get_term { my $self = shift; return $term{$$self} } 72 sub get_field { my $self = shift; return $field{$$self} } 104 =head2 get_regex 105 106 Retrieve the qr// object representing I<term>. 107 108 =head2 get_prefix 109 110 Retrieve the literal string (if any) that prefixes the wildcards 111 in I<term>. 112 113 =cut 114 115 sub get_term { my $self = shift; return $term{$$self} } 116 sub get_field { my $self = shift; return $field{$$self} } 117 sub get_regex { my $self = shift; return $regex{$$self} } 118 sub get_prefix { my $self = shift; return $prefix{$$self} } 73 119 74 120 sub DESTROY {
Note: See TracChangeset
for help on using the changeset viewer.