Beginning of a search query parser

2014-05-18 22:12:49 +02:00 · 2014-05-18 22:12:49 +02:00 · 1c092ee911
commit 1c092ee911
parent d81443385b
1 changed files with 293 additions and 0 deletions
--- a/search.py
+++ b/search.py
@ -0,0 +1,293 @@
 """Search query parser
 Modified by Phyks, 2014-05-18. Original source code is here:
 http://pyparsing.wikispaces.com/file/view/searchparser.py/30112816/searchparser.py
 version 2006-03-09
 This search query parser uses the excellent Pyparsing module 
 (http://pyparsing.sourceforge.net/) to parse search queries by users.
 It handles:
 * 'and', 'or', 'not' and implicit 'and' operators;
 * parentheses;
 * quoted strings;
 * wildcards at the end of a search term (help*);
 Requirements:
 * Python
 * Pyparsing
 If you run this script, it will perform a number of tests. To use it as a
 module, subclass the SearchQueryParser class and override the Get... methods.
 The ParserTest class gives a very simple example of how this could work.
 -------------------------------------------------------------------------------
 Copyright (c) 2006, Estrate, the Netherlands
 All rights reserved.
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
 * Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation 
  and/or other materials provided with the distribution.
 * Neither the name of Estrate nor the names of its contributors may be used
  to endorse or promote products derived from this software without specific
  prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 CONTRIBUTORS:
 - Steven Mooij
 - Rudolph Froger
 - Paul McGuire
 """
 from pyparsing import Word, alphanums, Keyword, Group, Combine, Forward
 from pyparsing import Suppress, Optional, OneOrMore, oneOf, Or
 class SearchQueryParser:
    """Parse a search query and evaluate it to a set of results.

    The grammar built by parser() tags every parsed group with a results
    name ('and', 'or', 'not', 'parenthesis', 'quotes', 'word',
    'wordwildcard'). evaluate() dispatches on that name to the matching
    evaluate* method, which in turn relies on the Get* hooks. The hooks
    defined here all return an empty set; subclasses override them to look
    terms up in a real index.
    """

    def __init__(self):
        # Dispatch table: results name set by the grammar -> evaluator.
        self._methods = {
            'and': self.evaluateAnd,
            'or': self.evaluateOr,
            'not': self.evaluateNot,
            'parenthesis': self.evaluateParenthesis,
            'quotes': self.evaluateQuotes,
            'word': self.evaluateWord,
            'wordwildcard': self.evaluateWordWildcard,
        }
        self._parser = self.parser()

    def parser(self):
        """
        This function returns a parser (the parseString method of the
        top-level grammar expression).
        The grammar should be like most full text search engines (Google, Tsearch, Lucene).
        Grammar:
        - a query consists of alphanumeric words, with an optional '*' wildcard
          at the end of a word
        - a sequence of words between quotes is a literal string
        - words can be used together by using operators ('and' or 'or')
        - words with operators can be grouped with parenthesis
        - a word or group of words can be preceded by a 'not' operator
        - the 'and' operator takes precedence over the 'or' operator
        - if an operator is missing, use an 'and' operator
        """
        operatorOr = Forward()
        operatorWord = Group(Combine(Word(alphanums) + Suppress('*'))).setResultsName('wordwildcard') | \
                            Group(Word(alphanums)).setResultsName('word')
        operatorQuotesContent = Forward()
        operatorQuotesContent << (
            (operatorWord + operatorQuotesContent) | operatorWord
        )
        operatorQuotes = Group(
            Or([Suppress('"') + operatorQuotesContent + Suppress('"'),
                Suppress('\'') + operatorQuotesContent + Suppress('\'')]
        )).setResultsName("quotes") | operatorWord
        operatorParenthesis = Group(
            (Suppress("(") + operatorOr + Suppress(")"))
        ).setResultsName("parenthesis") | operatorQuotes
        operatorNot = Forward()
        operatorNot << (Group(
            Suppress(Keyword("not", caseless=True)) + operatorNot
        ).setResultsName("not") | operatorParenthesis)
        # Operators that must not be taken as the start of an implicit-'and'
        # operand. Keyword is used (rather than a plain literal match) so the
        # check is case-insensitive like the explicit operators below, and so
        # that ordinary words merely starting with "and"/"or" (e.g. "order")
        # are still accepted as search terms.
        reservedOperator = (Keyword("and", caseless=True) |
                            Keyword("or", caseless=True))
        operatorAnd = Forward()
        operatorAnd << (Group(
            operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
        ).setResultsName("and") | Group(
            operatorNot + OneOrMore(~reservedOperator + operatorAnd)
        ).setResultsName("and") | operatorNot)
        operatorOr << (Group(
            operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
        ).setResultsName("or") | operatorAnd)
        return operatorOr.parseString

    def evaluateAnd(self, argument):
        """Return the intersection of the two evaluated operands."""
        return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))

    def evaluateOr(self, argument):
        """Return the union of the two evaluated operands."""
        return self.evaluate(argument[0]).union(self.evaluate(argument[1]))

    def evaluateNot(self, argument):
        """Evaluate the operand and hand the result to GetNot."""
        return self.GetNot(self.evaluate(argument[0]))

    def evaluateParenthesis(self, argument):
        """Evaluate the expression enclosed in the parentheses."""
        return self.evaluate(argument[0])

    def evaluateQuotes(self, argument):
        """Evaluate quoted strings

        First it does an 'and' on the individual search terms, then it asks
        the function GetQuotes to only return the subset of ID's that contain
        the literal string (the terms joined by single spaces).
        """
        search_terms = []
        # None means "no term evaluated yet". An empty set cannot be used as
        # that marker: an intersection that legitimately became empty would
        # otherwise be replaced by the matches of the next term.
        candidates = None
        for item in argument:
            search_terms.append(item[0])
            matches = self.evaluate(item)
            if candidates is None:
                candidates = matches
            else:
                candidates = candidates.intersection(matches)
        if candidates is None:
            candidates = set()
        return self.GetQuotes(' '.join(search_terms), candidates)

    def evaluateWord(self, argument):
        """Look up a plain word through GetWord."""
        return self.GetWord(argument[0])

    def evaluateWordWildcard(self, argument):
        """Look up a word prefix (the '*' is already stripped) through GetWordWildcard."""
        return self.GetWordWildcard(argument[0])

    def evaluate(self, argument):
        """Dispatch a parsed group to the evaluator registered for its results name."""
        return self._methods[argument.getName()](argument)

    def Parse(self, query):
        """Parse the query string and return the result of evaluating it."""
        return self.evaluate(self._parser(query)[0])

    def GetWord(self, word):
        """Hook: return the results matching `word`. This default returns an empty set."""
        return set()

    def GetWordWildcard(self, word):
        """Hook: return the results matching words starting with `word`. This default returns an empty set."""
        return set()

    def GetQuotes(self, search_string, tmp_result):
        """Hook: return the members of `tmp_result` containing `search_string`. This default returns an empty set."""
        return set()

    def GetNot(self, not_set):
        """Hook: return everything not in `not_set`.

        This default knows no results at all, so the difference is taken
        against an empty set and the outcome is always empty.
        """
        return set().difference(not_set)
 class ParserTest(SearchQueryParser):
    """Tests the parser with some search queries

    `tests` maps each query string to the set of document ids expected as the
    result. `docs` maps a document id to its text and `index` maps each word
    to the ids of the documents listed for it; the Get* hooks below answer
    the parser's lookups from these two tables.
    """
    tests = {
        'help': set([1, 2, 4, 5]),
        'help or hulp': set([1, 2, 3, 4, 5]),
        'help and hulp': set([2]),
        'help hulp': set([2]),
        'help and hulp or hilp': set([2, 3, 4]),
        'help or hulp and hilp': set([1, 2, 3, 4, 5]),
        'help or hulp or hilp or halp': set([1, 2, 3, 4, 5, 6]),
        '(help or hulp) and (hilp or halp)': set([3, 4, 5]),
        'help and (hilp or halp)': set([4, 5]),
        '(help and (hilp or halp)) or hulp': set([2, 3, 4, 5]),
        'not help': set([3, 6, 7, 8]),
        'not hulp and halp': set([5, 6]),
        'not (help and halp)': set([1, 2, 3, 4, 6, 7, 8]),
        '"help me please"': set([2]),
        '"help me please" or hulp': set([2, 3]),
        '"help me please" or (hulp and halp)': set([2]),
        'help*': set([1, 2, 4, 5, 8]),
        'help or hulp*': set([1, 2, 3, 4, 5]),
        'help* and hulp': set([2]),
        'help and hulp* or hilp': set([2, 3, 4]),
        'help* or hulp or hilp or halp': set([1, 2, 3, 4, 5, 6, 8]),
        '(help or hulp*) and (hilp* or halp)': set([3, 4, 5]),
        'help* and (hilp* or halp*)': set([4, 5]),
        '(help and (hilp* or halp)) or hulp*': set([2, 3, 4, 5]),
        'not help* and halp': set([6]),
        'not (help* and helpe*)': set([1, 2, 3, 4, 5, 6, 7]),
        '"help* me please"': set([2]),
        '"help* me* please" or hulp*': set([2, 3]),
        '"help me please*" or (hulp and halp)': set([2]),
        '"help me please" not (hulp and halp)': set([2]),
        '"help me please" hulp': set([2]),
        '\'help me please\' hulp': set([2]),
        'help and hilp and not holp': set([4]),
        'help hilp not holp': set([4]),
        'help hilp and not holp': set([4]),
    }
    docs = {
        1: 'help',
        2: 'help me please hulp',
        3: 'hulp hilp',
        4: 'help hilp',
        5: 'halp thinks he needs help',
        6: 'he needs halp',
        7: 'nothing',
        8: 'helper',
    }
    index = {
        'help': set((1, 2, 4, 5)),
        'me': set((2,)),
        'please': set((2,)),
        'hulp': set((2, 3,)),
        'hilp': set((3, 4,)),
        'halp': set((5, 6,)),
        'thinks': set((5,)),
        'he': set((5, 6,)),
        'needs': set((5, 6,)),
        'nothing': set((7,)),
        'helper': set((8,)),
    }

    def GetWord(self, word):
        """Return the ids indexed under `word`, or an empty set if it is not indexed."""
        return self.index.get(word, set())

    def GetWordWildcard(self, word):
        """Return the ids of every indexed term that starts with `word`."""
        result = set()
        for term, doc_ids in self.index.items():
            if term.startswith(word):
                result = result.union(doc_ids)
        return result

    def GetQuotes(self, search_string, tmp_result):
        """Keep the ids from `tmp_result` whose document text contains `search_string`."""
        result = set()
        for doc_id in tmp_result:
            if search_string in self.docs[doc_id]:
                result.add(doc_id)
        return result

    def GetNot(self, not_set):
        """Return the ids of all documents that are not in `not_set`."""
        every_doc = set(self.docs.keys())
        return every_doc.difference(not_set)

    def Test(self):
        """Run every query in `tests`, print a report and return True if all matched."""
        all_ok = True
        for query, expected in self.tests.items():
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and is also valid on Python 3.
            print(query)
            result = self.Parse(query)
            print('Result: %s' % result)
            print('Expect: %s' % expected)
            if expected == result:
                print('Test OK')
            else:
                all_ok = False
                print('>>>>>>>>>>>>>>>>>>>>>>Test ERROR<<<<<<<<<<<<<<<<<<<<<')
            print('')
        return all_ok
 # Script entry point: run the built-in self-test and report the outcome.
 if __name__ == '__main__':
    # Single-argument print(...) gives the same output as the print statement
    # on Python 2 and is also valid on Python 3.
    if ParserTest().Test():
        print('All tests OK')
    else:
        print('One or more tests FAILED')