# Recovered from a git patch that created search.py:
#   commit 1c092ee9111dafe5cecd50edc4a1e5c6181f0d64
#   From: Phyks, Sun, 18 May 2014 22:12:49 +0200
#   Subject: [PATCH] Beginning of a search query parser
"""Search query parser

Modified by Phyks, 2014-05-18. Original source code is here:
http://pyparsing.wikispaces.com/file/view/searchparser.py/30112816/searchparser.py

version 2006-03-09

This search query parser uses the excellent Pyparsing module
(http://pyparsing.sourceforge.net/) to parse search queries by users.
It handles:

* 'and', 'or', 'not' and implicit 'and' operators;
* parentheses;
* quoted strings;
* wildcards at the end of a search term (help*);

Requirements:
* Python
* Pyparsing

If you run this script, it will perform a number of tests. To use it as a
module, you should use inheritance on the SearchQueryParser class and override
the Get... methods. The ParserTest class gives a very simple example of how this
could work.

-------------------------------------------------------------------------------
Copyright (c) 2006, Estrate, the Netherlands
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
* Neither the name of Estrate nor the names of its contributors may be used
  to endorse or promote products derived from this software without specific
  prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CONTRIBUTORS:
- Steven Mooij
- Rudolph Froger
- Paul McGuire
"""
from pyparsing import Word, alphanums, Keyword, Group, Combine, Forward
from pyparsing import Suppress, Optional, OneOrMore, oneOf, Or


class SearchQueryParser(object):
    """Parse a search query and evaluate it to a set of results.

    The query is parsed into nested, named groups ('and', 'or', 'not',
    'parenthesis', 'quotes', 'word', 'wordwildcard'); each group name is
    dispatched to the matching evaluate... method. The Get... methods are
    the lookup hooks: the versions defined here all return an empty set, so
    subclasses override them to query a real index (see ParserTest).
    """

    def __init__(self):
        # Maps the results name attached to each parsed group to the method
        # that evaluates that kind of group.
        self._methods = {
            'and': self.evaluateAnd,
            'or': self.evaluateOr,
            'not': self.evaluateNot,
            'parenthesis': self.evaluateParenthesis,
            'quotes': self.evaluateQuotes,
            'word': self.evaluateWord,
            'wordwildcard': self.evaluateWordWildcard,
        }
        self._parser = self.parser()

    def parser(self):
        """
        This function returns a parser.
        The grammar should be like most full text search engines (Google,
        Tsearch, Lucene).

        Grammar:
        - a query consists of alphanumeric words, with an optional '*' wildcard
          at the end of a word
        - a sequence of words between quotes is a literal string
        - words can be used together by using operators ('and' or 'or')
        - words with operators can be grouped with parenthesis
        - a word or group of words can be preceded by a 'not' operator
        - the 'and' operator precedes an 'or' operator
        - if an operator is missing, use an 'and' operator

        Returns the bound parseString method of the top-level expression.
        """
        operatorOr = Forward()

        # The wildcard alternative must be tried first, otherwise the plain
        # word alternative would match and leave the '*' unconsumed.
        operatorWord = Group(
            Combine(Word(alphanums) + Suppress('*'))
        ).setResultsName('wordwildcard') | Group(
            Word(alphanums)
        ).setResultsName('word')

        # One or more words, expressed recursively.
        operatorQuotesContent = Forward()
        operatorQuotesContent << (
            (operatorWord + operatorQuotesContent) | operatorWord
        )

        # A quoted phrase may use either double or single quotes.
        operatorQuotes = Group(
            Or([Suppress('"') + operatorQuotesContent + Suppress('"'),
                Suppress('\'') + operatorQuotesContent + Suppress('\'')])
        ).setResultsName("quotes") | operatorWord

        operatorParenthesis = Group(
            (Suppress("(") + operatorOr + Suppress(")"))
        ).setResultsName("parenthesis") | operatorQuotes

        operatorNot = Forward()
        operatorNot << (Group(
            Suppress(Keyword("not", caseless=True)) + operatorNot
        ).setResultsName("not") | operatorParenthesis)

        # Lookahead used by the implicit 'and': the next token must not be an
        # explicit binary operator. Caseless keywords are used so that the
        # lookahead agrees with the operators themselves; a plain literal
        # match would also reject ordinary words that merely start with
        # "and"/"or" and would let upper-case operators through as words.
        binaryOperator = (Keyword("and", caseless=True) |
                          Keyword("or", caseless=True))

        operatorAnd = Forward()
        operatorAnd << (Group(
            operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
        ).setResultsName("and") | Group(
            operatorNot + OneOrMore(~binaryOperator + operatorAnd)
        ).setResultsName("and") | operatorNot)

        operatorOr << (Group(
            operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
        ).setResultsName("or") | operatorAnd)

        return operatorOr.parseString

    def evaluateAnd(self, argument):
        """Return the intersection of the two operands of an 'and' group."""
        return self.evaluate(argument[0]).intersection(
            self.evaluate(argument[1]))

    def evaluateOr(self, argument):
        """Return the union of the two operands of an 'or' group."""
        return self.evaluate(argument[0]).union(self.evaluate(argument[1]))

    def evaluateNot(self, argument):
        """Return GetNot applied to the evaluated operand of a 'not' group."""
        return self.GetNot(self.evaluate(argument[0]))

    def evaluateParenthesis(self, argument):
        """Evaluate the expression enclosed in parentheses."""
        return self.evaluate(argument[0])

    def evaluateQuotes(self, argument):
        """Evaluate quoted strings

        First it does an 'and' on the individual search terms, then it asks
        the function GetQuotes to only return the subset of ID's that contain
        the literal string.
        """
        candidates = None
        search_terms = []
        for item in argument:
            search_terms.append(item[0])
            matches = self.evaluate(item)
            # A None sentinel marks "no term seen yet". Testing for an empty
            # set instead would restart the intersection whenever it had
            # legitimately become empty.
            if candidates is None:
                candidates = matches
            else:
                candidates = candidates.intersection(matches)
        if candidates is None:
            candidates = set()
        return self.GetQuotes(' '.join(search_terms), candidates)

    def evaluateWord(self, argument):
        """Look up a plain word through GetWord."""
        return self.GetWord(argument[0])

    def evaluateWordWildcard(self, argument):
        """Look up a word prefix (the '*' is already stripped)."""
        return self.GetWordWildcard(argument[0])

    def evaluate(self, argument):
        """Dispatch a parsed group to the method registered for its name."""
        return self._methods[argument.getName()](argument)

    def Parse(self, query):
        """Parse the query string and return the evaluated result."""
        return self.evaluate(self._parser(query)[0])

    def GetWord(self, word):
        """Hook: return the results matching `word`. Returns an empty set."""
        return set()

    def GetWordWildcard(self, word):
        """Hook: return the results for words starting with `word`.

        Returns an empty set.
        """
        return set()

    def GetQuotes(self, search_string, tmp_result):
        """Hook: return the members of `tmp_result` containing the literal
        `search_string`. Returns an empty set.
        """
        return set()

    def GetNot(self, not_set):
        """Hook: return the complement of `not_set`.

        This version has no universe to complement against, so the result is
        always an empty set.
        """
        return set().difference(not_set)


class ParserTest(SearchQueryParser):
    """Tests the parser with some search queries
    tests contains a dictionary with tests and expected results.
    """
    tests = {
        'help': set([1, 2, 4, 5]),
        'help or hulp': set([1, 2, 3, 4, 5]),
        'help and hulp': set([2]),
        'help hulp': set([2]),
        'help and hulp or hilp': set([2, 3, 4]),
        'help or hulp and hilp': set([1, 2, 3, 4, 5]),
        'help or hulp or hilp or halp': set([1, 2, 3, 4, 5, 6]),
        '(help or hulp) and (hilp or halp)': set([3, 4, 5]),
        'help and (hilp or halp)': set([4, 5]),
        '(help and (hilp or halp)) or hulp': set([2, 3, 4, 5]),
        'not help': set([3, 6, 7, 8]),
        'not hulp and halp': set([5, 6]),
        'not (help and halp)': set([1, 2, 3, 4, 6, 7, 8]),
        '"help me please"': set([2]),
        '"help me please" or hulp': set([2, 3]),
        '"help me please" or (hulp and halp)': set([2]),
        'help*': set([1, 2, 4, 5, 8]),
        'help or hulp*': set([1, 2, 3, 4, 5]),
        'help* and hulp': set([2]),
        'help and hulp* or hilp': set([2, 3, 4]),
        'help* or hulp or hilp or halp': set([1, 2, 3, 4, 5, 6, 8]),
        '(help or hulp*) and (hilp* or halp)': set([3, 4, 5]),
        'help* and (hilp* or halp*)': set([4, 5]),
        '(help and (hilp* or halp)) or hulp*': set([2, 3, 4, 5]),
        'not help* and halp': set([6]),
        'not (help* and helpe*)': set([1, 2, 3, 4, 5, 6, 7]),
        '"help* me please"': set([2]),
        '"help* me* please" or hulp*': set([2, 3]),
        '"help me please*" or (hulp and halp)': set([2]),
        '"help me please" not (hulp and halp)': set([2]),
        '"help me please" hulp': set([2]),
        '\'help me please\' hulp': set([2]),
        'help and hilp and not holp': set([4]),
        'help hilp not holp': set([4]),
        'help hilp and not holp': set([4]),
    }

    # Document id -> document text.
    docs = {
        1: 'help',
        2: 'help me please hulp',
        3: 'hulp hilp',
        4: 'help hilp',
        5: 'halp thinks he needs help',
        6: 'he needs halp',
        7: 'nothing',
        8: 'helper',
    }

    # Word -> ids of the documents containing that word.
    index = {
        'help': set((1, 2, 4, 5)),
        'me': set((2,)),
        'please': set((2,)),
        'hulp': set((2, 3,)),
        'hilp': set((3, 4,)),
        'halp': set((5, 6,)),
        'thinks': set((5,)),
        'he': set((5, 6,)),
        'needs': set((5, 6,)),
        'nothing': set((7,)),
        'helper': set((8,)),
    }

    def GetWord(self, word):
        """Return the ids indexed under `word`, or an empty set."""
        if word in self.index:
            return self.index[word]
        return set()

    def GetWordWildcard(self, word):
        """Return the ids of every indexed word that starts with `word`."""
        result = set()
        for indexed_word, doc_ids in self.index.items():
            if indexed_word.startswith(word):
                result = result.union(doc_ids)
        return result

    def GetQuotes(self, search_string, tmp_result):
        """Return the ids in `tmp_result` whose document text contains
        `search_string` literally.
        """
        result = set()
        for doc_id in tmp_result:
            if search_string in self.docs[doc_id]:
                result.add(doc_id)
        return result

    def GetNot(self, not_set):
        """Return every document id that is not in `not_set`."""
        all_ids = set(self.docs.keys())
        return all_ids.difference(not_set)

    def Test(self):
        """Run every query in `tests`, print the outcome of each one and
        return True only if all results equal the expected sets.
        """
        all_ok = True
        for query, expected in self.tests.items():
            # Single-argument print(...) calls behave the same on Python 2
            # and Python 3.
            print(query)
            result = self.Parse(query)
            print('Result: %s' % result)
            print('Expect: %s' % expected)
            if expected == result:
                print('Test OK')
            else:
                all_ok = False
                print('>>>>>>>>>>>>>>>>>>>>>>Test ERROR<<<<<<<<<<<<<<<<<<<<<')
            print('')
        return all_ok


if __name__ == '__main__':
    if ParserTest().Test():
        print('All tests OK')
    else:
        print('One or more tests FAILED')