#!/usr/bin/perl -w
# Preempt::query::parser - "generic" search query parser
# This module parses query terms handed to it in a string.
#
# Essentially, a "query" value is broken into parts.  Each part is
# a quoted series of "words", or otherwise a single "word".  These "words"
# are any series of characters, separated by spaces (as opposed to English
# words, or any other such abstract concept ;-).  Each one is taken
# in turn.
#
# If the word is a boolean (in capitals), AND, OR or NOT, then it is
# used as the "op" between words.  If a word is preceded by a
# +, | or - then that too is used as the op between words (respectively).
# Incidentally, two words separated by a + (eg. the+cat) are treated as
# if they were quoted.
#
# Words can also be attribute selectors.  For example, a word "intitle:banana"
# would mean "look for the word banana in the title".  Other selectors are
# "site:", "intext:", "filetype:", "mimetype:" and "inurl:".  These cause
# attributes to be defined for the search term (which is stripped of its
# attribute selector).
# Booleans prepended to attribute selectors are honoured.
#
# Ultimately, a triplet of lists is created (and returned in a hashref).  The
# actual return code of the routine is the number of real search words found.
# "Real search words" are those not used in attribute selectors (although
# "intext:" is an exception, and considered a real search word).  Thus, if
# a search of "site:www.banana.com" was requested, no actual search need be
# performed because the search terms don't make sense (because they limit
# a search, but don't specify what to search for).
#
# The three lists returned all correspond to each other.  The first contains
# actual search terms, the second the boolean operators involved and the
# third any attribute selections (of which there may be more than one).
# The last search term in the list has no boolean associated with it -
# thus, "term1 boolean1 term2" is the way the logic works.
# All terms have (at least) default attribute selectors (of "ALL").
#
# In theory, this routine could parse any kind of search query and for it
# to be used with any search middle or backend.  In truth, it's all designed
# for Zap/Zebra, but might not take too much to mangle into other forms if
# needs be.
#
# Usage:
#
#   my $user_query = "france NOT paris";
#   my %query_result=();
#   my $count = Preempt::query::parser::parse($user_query, \%query_result);
#   if($count > 0) {
#       print "The search was for " . $query_result{'search_terms_html'} . "\n";
#   } else {
#       print "The search terms did not contain any searchable words\n";
#   }

package Preempt::query::parser;

use strict;
use warnings;

use HTML::Entities;

use constant DEFAULT_OP   => 'and';
use constant DEFAULT_ATTR => 'ALL';

# Return the arguments with case-insensitive duplicates removed, keeping the
# first occurrence of each.  Plain string comparison is used (not a regex),
# so terms containing regex metacharacters are handled safely.
sub _uniq_ci {
    my %seen;
    return grep { !$seen{ lc $_ }++ } @_;
}

# Parse a user supplied search query.
#
# query       = the search query (as provided by the user); undef is treated
#               as an empty query
# outputref   = a hash reference that will get populated with query
#               information (keys: words, ops, attribs, limiters,
#               limiters_html, redundant, redundant_html, stopwords,
#               stopwords_html, synonyms, synonyms_html, search_terms,
#               search_terms_html, search_terms_full, search_terms_full_html,
#               search_terms_stripped, search_terms_stripped_html,
#               search_terms_full_stripped, search_terms_full_stripped_html)
# stopword_cb = an optional reference to a call back function that determines
#               if words are stopwords or not; it is called with one word and
#               a true return value causes the word to be dropped
# synonym_cb  = an optional reference to a call back function that is called
#               with one real search word and returns a synonym for it, or
#               undef if there is none
#
# Returns the number of real search words, plus the number of (distinct)
# redundant terms and stopwords that were dropped, so that a query made up
# only of dropped terms can still be reported on.  A query holding nothing
# but search limiters (or nothing at all) returns 0.
sub parse {
    my ($query, $outputref, $stopword_cb, $synonym_cb) = @_;

    $query = '' unless defined $query;

    my @words           = ();
    my @ops             = ();
    my @attribs         = ();
    my @search_limiters = ();
    my @redundant_terms = ();
    my @stopwords       = ();

    # Break the string into "words" (meaning parts, as opposed to real
    # words): either a double-quoted run or a run of non-space characters.
    my @parts = ($query =~ /("[^"]*"|\S+)/g);

    # A term is kept "in holding" until we know which op follows it.
    my $holding      = "";
    my $holding_attr = "";

    foreach my $part (@parts) {
        # Remove the quotes around parts, if they have them...
        $part =~ s/"//g;

        # If a term is 'worda+wordb' then this should be the same
        # as putting it in double quotes (as opposed to 'worda +wordb',
        # which is different)
        $part =~ s/([^\s]+)\+([^\s]+)/$1 $2/g;

        # All terms have default attributes unless we find out otherwise
        my $attr = DEFAULT_ATTR;

        # Okay, see if the part is actually an op...
        if (($part eq "OR") || ($part eq "AND") || ($part eq "NOT")) {
            # AND is the default op, so spelling it out is always redundant.
            push @redundant_terms, $part if ($part eq "AND");

            if ($holding ne "") {
                # Have a word in holding, so add it to the list, with
                # this as the op
                push @words,   $holding;
                push @ops,     lc($part);
                push @attribs, $holding_attr;
                $holding      = "";
                $holding_attr = "";
            }
            else {
                # If no word in holding, we actually drop this op.
                # It makes no sense to have "NOT worda wordb" because
                # that means "NOT worda AND wordb".  It should be
                # written as "wordb NOT worda" if that's what you want.
                push @redundant_terms, $part;
            }
            next;
        }

        # It's not an op per-se, but one might be included in the word
        # itself (this also works on attribute selectors)
        my $op = "";
        if ($part =~ s/^-//) {
            $op = "not";
        }
        elsif ($part =~ s/^\+//) {
            $op = "and";
        }
        elsif ($part =~ s/^\|//) {
            $op = "or";
        }

        # It might also have attributes attached.  site:, mimetype: and
        # filetype: limit the search rather than contribute search words.
        if ($part =~ s/^site:(.*)$/$1/i) {
            unless ($part =~ /^\s*$/) {
                $attr = "SITE";
                push @search_limiters, "site:" . $part;
            }
        }
        elsif ($part =~ s/^intitle:(.*)$/$1/i) {
            $attr = "ALL+TITLE";
        }
        elsif ($part =~ s/^inurl:(.*)$/$1/i) {
            $attr = "ALL+URL";
        }
        elsif ($part =~ s/^mimetype:(.*)$/$1/i) {
            unless ($part =~ /^\s*$/) {
                $attr = "MIME";
                push @search_limiters, "mime:" . $part;
            }
        }
        elsif ($part =~ s/^filetype:(.*)$/$1/i) {
            unless ($part =~ /^\s*$/) {
                $attr = "FILEEXT";
                push @search_limiters, "filetype:" . $part;
            }
        }
        elsif ($part =~ s/^intext:(.*)$/$1/i) {
            $attr = "ALL+BODY";
        }

        # If we got an empty qualifier, then this isn't actually a
        # search term
        next if ($part =~ /^\s*$/);

        # If we have a word in holding, then stick it on the pile with
        # the op we've just figured out (or the default op).
        if ($holding ne "") {
            push @words,   $holding;
            push @ops,     ($op eq "" ? DEFAULT_OP : $op);
            push @attribs, $holding_attr;
        }

        # Now put our new word into holding (along with its corresponding
        # attribute selector), unless the optional stopword callback says
        # the word should be dropped.
        if (defined($stopword_cb) && $stopword_cb->($part)) {
            push @stopwords, $part;
            $holding      = "";
            $holding_attr = "";
        }
        else {
            $holding      = $part;
            $holding_attr = $attr;
        }
    }

    # Okay, end of the loop.  Just make sure we don't have anything
    # in holding.  Remember, no "op" required, because this is the
    # last term.
    if ($holding ne "") {
        push @words,   $holding;
        push @attribs, $holding_attr;
    }

    # Now build up the various query info strings...
    my $real_search_terms               = "";
    my $real_search_terms_html          = "";    # Encoded entities version
    my $real_search_terms_full          = "";
    my $real_search_terms_full_html     = "";
    my $search_terms_stripped_html      = "";
    my $search_terms_full_stripped_html = "";
    my %synonyms_inserted               = ();
    my %synonyms_inserted_html          = ();
    my $real_search_terms_count         = 0;

    # C-style loop on purpose: @words grows while we walk it whenever a
    # synonym is inserted, and the inserted synonym must be visited too.
    for (my $i = 0; $i <= $#words; $i++) {
        # If the current word has attributes that include SITE, MIME or
        # FILEEXT, then this word is a search limiter, otherwise it's a
        # real search term.
        next if ($attribs[$i] =~ /MIME|SITE|FILEEXT/);

        my $word = $words[$i];

        if (defined($synonym_cb)) {
            my $synonym = $synonym_cb->($word);
            if (defined($synonym)
                && !grep { lc($_) eq lc($synonym) } @words)
            {
                # We have a synonym for this word, so slip it in directly
                # after the word, joined to it with "or".  An offset equal
                # to the array length is valid for splice (it appends), so
                # this also works when the word is the last one.
                splice(@words,   $i + 1, 0, $synonym);
                splice(@attribs, $i + 1, 0, DEFAULT_ATTR);
                splice(@ops,     $i,     0, 'or');

                # Make a note of this insertion, so long as it's not
                # done already...
                unless (grep { lc($_) eq lc($word) } keys %synonyms_inserted) {
                    $synonyms_inserted{$word}      = $synonym;
                    $synonyms_inserted_html{$word} = encode_entities($synonym);
                }
            }
        }

        # Real search term...
        my $word_html = encode_entities($word);
        $real_search_terms_count++;
        $real_search_terms               .= "$word ";
        $real_search_terms_html          .= $word_html . ' ';
        $real_search_terms_full          .= "$word ";
        $real_search_terms_full_html     .= $word_html . ' ';
        $search_terms_stripped_html      .= $word_html . ' ';
        $search_terms_full_stripped_html .= $word_html . ' ';

        if (defined($ops[$i])) {
            my $uop = uc($ops[$i]);
            # AND is implied, so only the "full" strings spell it out.
            $real_search_terms               .= "$uop " if ($uop ne "AND");
            $real_search_terms_full          .= "$uop ";
            $search_terms_full_stripped_html .= "$uop ";
        }
    }
    $real_search_terms               =~ s/\s+$//;
    $real_search_terms_full          =~ s/\s+$//;
    $search_terms_full_stripped_html =~ s/\s+$//;

    # uniq the search_limiters, redundant_terms and stopwords lists
    # (case-insensitively), and build their entity-encoded counterparts.
    @search_limiters = _uniq_ci(@search_limiters);
    @redundant_terms = _uniq_ci(@redundant_terms);
    @stopwords       = _uniq_ci(@stopwords);
    my @search_limiters_html = map { encode_entities($_) } @search_limiters;
    my @redundant_terms_html = map { encode_entities($_) } @redundant_terms;
    my @stopwords_html       = map { encode_entities($_) } @stopwords;

    # The "stripped" plain-text variants are the search term strings with
    # anything that looks like a markup tag removed.
    (my $search_terms_stripped      = $real_search_terms)      =~ s/<[^>]*>//g;
    (my $search_terms_full_stripped = $real_search_terms_full) =~ s/<[^>]*>//g;

    # Populate the output hash reference
    $outputref->{'words'}                           = \@words;
    $outputref->{'ops'}                             = \@ops;
    $outputref->{'attribs'}                         = \@attribs;
    $outputref->{'limiters'}                        = \@search_limiters;
    $outputref->{'limiters_html'}                   = \@search_limiters_html;
    $outputref->{'redundant'}                       = \@redundant_terms;
    $outputref->{'redundant_html'}                  = \@redundant_terms_html;
    $outputref->{'stopwords'}                       = \@stopwords;
    $outputref->{'stopwords_html'}                  = \@stopwords_html;
    $outputref->{'synonyms'}                        = \%synonyms_inserted;
    $outputref->{'synonyms_html'}                   = \%synonyms_inserted_html;
    $outputref->{'search_terms'}                    = $real_search_terms;
    $outputref->{'search_terms_html'}               = $real_search_terms_html;
    $outputref->{'search_terms_full'}               = $real_search_terms_full;
    $outputref->{'search_terms_full_html'}          = $real_search_terms_full_html;
    $outputref->{'search_terms_stripped'}           = $search_terms_stripped;
    $outputref->{'search_terms_stripped_html'}      = $search_terms_stripped_html;
    $outputref->{'search_terms_full_stripped'}      = $search_terms_full_stripped;
    $outputref->{'search_terms_full_stripped_html'} = $search_terms_full_stripped_html;

    # Add the terms we dropped to the count so that we perform an empty search
    # and so can report the things we dropped, rather than silently ignoring
    # them.  With nothing searchable and nothing dropped the result is 0.
    return $real_search_terms_count
         + scalar(@redundant_terms)
         + scalar(@stopwords);
}

1;