294 lines
11 KiB
Python
294 lines
11 KiB
Python
"""Search query parser
|
|
|
|
Modified by Phyks, 2014-05-18. Original source code is here:
|
|
http://pyparsing.wikispaces.com/file/view/searchparser.py/30112816/searchparser.py
|
|
|
|
version 2006-03-09
|
|
|
|
This search query parser uses the excellent Pyparsing module
|
|
(http://pyparsing.sourceforge.net/) to parse search queries by users.
|
|
It handles:
|
|
|
|
* 'and', 'or' and implicit 'and' operators;
|
|
* parentheses;
|
|
* quoted strings;
|
|
* wildcards at the end of a search term (help*);
|
|
|
|
Requirements:
|
|
* Python
|
|
* Pyparsing
|
|
|
|
If you run this script, it will perform a number of tests. To use it as a
|
|
module, you should use inheritance on the SearchQueryParser class and override
|
|
the Get... methods. The ParserTest class gives a very simple example of how this
|
|
could work.
|
|
|
|
-------------------------------------------------------------------------------
|
|
Copyright (c) 2006, Estrate, the Netherlands
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without modification,
|
|
are permitted provided that the following conditions are met:
|
|
|
|
* Redistributions of source code must retain the above copyright notice, this
|
|
list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright notice,
|
|
this list of conditions and the following disclaimer in the documentation
|
|
and/or other materials provided with the distribution.
|
|
* Neither the name of Estrate nor the names of its contributors may be used
|
|
to endorse or promote products derived from this software without specific
|
|
prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
|
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
CONTRIBUTORS:
|
|
- Steven Mooij
|
|
- Rudolph Froger
|
|
- Paul McGuire
|
|
"""
|
|
from pyparsing import Word, alphanums, Keyword, Group, Combine, Forward
|
|
from pyparsing import Suppress, Optional, OneOrMore, oneOf, Or
|
|
|
|
class SearchQueryParser(object):
    """Parse a search query and evaluate it into a set of results.

    The class builds a pyparsing grammar once (see ``parser``) and evaluates
    the resulting parse tree by dispatching on each node's results name.
    Subclasses are expected to override the ``Get...`` hooks (``GetWord``,
    ``GetWordWildcard``, ``GetQuotes``, ``GetNot``); the defaults here all
    return an empty set.
    """

    def __init__(self):
        # Maps the results name of a parsed group to the method evaluating it.
        self._methods = {
            'and': self.evaluateAnd,
            'or': self.evaluateOr,
            'not': self.evaluateNot,
            'parenthesis': self.evaluateParenthesis,
            'quotes': self.evaluateQuotes,
            'word': self.evaluateWord,
            'wordwildcard': self.evaluateWordWildcard,
        }
        self._parser = self.parser()

    def parser(self):
        """
        Build the grammar and return its ``parseString`` bound method.

        The grammar should be like most full text search engines
        (Google, Tsearch, Lucene).

        Grammar:
        - a query consists of alphanumeric words, with an optional '*'
          wildcard at the end of a word
        - a sequence of words between quotes is a literal string
        - words can be used together by using operators ('and' or 'or')
        - words with operators can be grouped with parenthesis
        - a word or group of words can be preceded by a 'not' operator
        - the 'and' operator binds more tightly than the 'or' operator
        - if an operator is missing, use an 'and' operator
        """
        operatorOr = Forward()

        # The wildcard alternative is tried first so that "help*" is not
        # matched as the plain word "help" followed by a stray '*'.
        operatorWord = Group(
            Combine(Word(alphanums) + Suppress('*'))
        ).setResultsName('wordwildcard') | \
            Group(Word(alphanums)).setResultsName('word')

        # One or more words, expressed right-recursively.
        operatorQuotesContent = Forward()
        operatorQuotesContent << (
            (operatorWord + operatorQuotesContent) | operatorWord
        )

        # A phrase may be wrapped in double or in single quotes.
        operatorQuotes = Group(
            Or([Suppress('"') + operatorQuotesContent + Suppress('"'),
                Suppress('\'') + operatorQuotesContent + Suppress('\'')])
        ).setResultsName("quotes") | operatorWord

        operatorParenthesis = Group(
            (Suppress("(") + operatorOr + Suppress(")"))
        ).setResultsName("parenthesis") | operatorQuotes

        operatorNot = Forward()
        operatorNot << (Group(
            Suppress(Keyword("not", caseless=True)) + operatorNot
        ).setResultsName("not") | operatorParenthesis)

        # Lookahead used by the implicit 'and': the next token must not be
        # an explicit operator. Keyword (rather than a plain literal match)
        # keeps this word-bounded and caseless, consistent with the explicit
        # operators, so "help orange" is two terms and "help OR hulp" is an
        # 'or' expression.
        explicitOperator = (Keyword("and", caseless=True) |
                            Keyword("or", caseless=True))

        operatorAnd = Forward()
        operatorAnd << (Group(
            operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
        ).setResultsName("and") | Group(
            operatorNot + OneOrMore(~explicitOperator + operatorAnd)
        ).setResultsName("and") | operatorNot)

        operatorOr << (Group(
            operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
        ).setResultsName("or") | operatorAnd)

        return operatorOr.parseString

    def evaluateAnd(self, argument):
        """Return the intersection of the two evaluated operands."""
        return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))

    def evaluateOr(self, argument):
        """Return the union of the two evaluated operands."""
        return self.evaluate(argument[0]).union(self.evaluate(argument[1]))

    def evaluateNot(self, argument):
        """Evaluate the operand and hand the result to ``GetNot``."""
        return self.GetNot(self.evaluate(argument[0]))

    def evaluateParenthesis(self, argument):
        """Evaluate the expression enclosed by the parentheses."""
        return self.evaluate(argument[0])

    def evaluateQuotes(self, argument):
        """Evaluate quoted strings

        First it does an 'and' on the individual search terms, then it asks
        the function GetQuotes to only return the subset of ID's that contain
        the literal string.
        """
        search_terms = []
        # None means "no term seen yet". An empty set must not play that
        # role: once the intersection is empty it has to stay empty instead
        # of being replaced by the matches of the next term.
        candidates = None
        for item in argument:
            search_terms.append(item[0])
            matches = self.evaluate(item)
            if candidates is None:
                candidates = matches
            else:
                candidates = candidates.intersection(matches)
        if candidates is None:
            candidates = set()
        return self.GetQuotes(' '.join(search_terms), candidates)

    def evaluateWord(self, argument):
        """Look up a plain word through ``GetWord``."""
        return self.GetWord(argument[0])

    def evaluateWordWildcard(self, argument):
        """Look up a word prefix (the '*' is already stripped) through
        ``GetWordWildcard``."""
        return self.GetWordWildcard(argument[0])

    def evaluate(self, argument):
        """Dispatch a parsed group to the evaluator registered for its
        results name."""
        return self._methods[argument.getName()](argument)

    def Parse(self, query):
        """Parse ``query`` and return the evaluation of its first token group.

        NOTE(review): the parse function is called without ``parseAll``, so
        presumably trailing text the grammar cannot consume is ignored
        rather than reported - confirm against the pyparsing documentation.
        """
        return self.evaluate(self._parser(query)[0])

    def GetWord(self, word):
        """Hook: return the results matching ``word``. Default: empty set."""
        return set()

    def GetWordWildcard(self, word):
        """Hook: return the results matching the prefix ``word``.
        Default: empty set."""
        return set()

    def GetQuotes(self, search_string, tmp_result):
        """Hook: return the members of ``tmp_result`` that contain the
        literal ``search_string``. Default: empty set."""
        return set()

    def GetNot(self, not_set):
        """Hook: return the complement of ``not_set``.

        The default has no universe of results to subtract from, so it
        always yields an empty set.
        """
        return set().difference(not_set)
|
|
|
|
|
|
class ParserTest(SearchQueryParser):
    """Tests the parser with some search queries

    ``tests`` maps each query string to the set of document IDs it is
    expected to return, ``docs`` maps a document ID to its text and
    ``index`` maps each word to the set of document IDs containing it.
    """
    tests = {
        'help': set([1, 2, 4, 5]),
        'help or hulp': set([1, 2, 3, 4, 5]),
        'help and hulp': set([2]),
        'help hulp': set([2]),
        'help and hulp or hilp': set([2, 3, 4]),
        'help or hulp and hilp': set([1, 2, 3, 4, 5]),
        'help or hulp or hilp or halp': set([1, 2, 3, 4, 5, 6]),
        '(help or hulp) and (hilp or halp)': set([3, 4, 5]),
        'help and (hilp or halp)': set([4, 5]),
        '(help and (hilp or halp)) or hulp': set([2, 3, 4, 5]),
        'not help': set([3, 6, 7, 8]),
        'not hulp and halp': set([5, 6]),
        'not (help and halp)': set([1, 2, 3, 4, 6, 7, 8]),
        '"help me please"': set([2]),
        '"help me please" or hulp': set([2, 3]),
        '"help me please" or (hulp and halp)': set([2]),
        'help*': set([1, 2, 4, 5, 8]),
        'help or hulp*': set([1, 2, 3, 4, 5]),
        'help* and hulp': set([2]),
        'help and hulp* or hilp': set([2, 3, 4]),
        'help* or hulp or hilp or halp': set([1, 2, 3, 4, 5, 6, 8]),
        '(help or hulp*) and (hilp* or halp)': set([3, 4, 5]),
        'help* and (hilp* or halp*)': set([4, 5]),
        '(help and (hilp* or halp)) or hulp*': set([2, 3, 4, 5]),
        'not help* and halp': set([6]),
        'not (help* and helpe*)': set([1, 2, 3, 4, 5, 6, 7]),
        '"help* me please"': set([2]),
        '"help* me* please" or hulp*': set([2, 3]),
        '"help me please*" or (hulp and halp)': set([2]),
        '"help me please" not (hulp and halp)': set([2]),
        '"help me please" hulp': set([2]),
        '\'help me please\' hulp': set([2]),
        'help and hilp and not holp': set([4]),
        'help hilp not holp': set([4]),
        'help hilp and not holp': set([4]),
    }

    docs = {
        1: 'help',
        2: 'help me please hulp',
        3: 'hulp hilp',
        4: 'help hilp',
        5: 'halp thinks he needs help',
        6: 'he needs halp',
        7: 'nothing',
        8: 'helper',
    }

    index = {
        'help': set((1, 2, 4, 5)),
        'me': set((2,)),
        'please': set((2,)),
        'hulp': set((2, 3,)),
        'hilp': set((3, 4,)),
        'halp': set((5, 6,)),
        'thinks': set((5,)),
        'he': set((5, 6,)),
        'needs': set((5, 6,)),
        'nothing': set((7,)),
        'helper': set((8,)),
    }

    def GetWord(self, word):
        """Return the IDs indexed under ``word``, or an empty set."""
        # dict.get replaces the Python 2-only dict.has_key lookup.
        return self.index.get(word, set())

    def GetWordWildcard(self, word):
        """Return the IDs of every indexed word that starts with ``word``."""
        result = set()
        for indexed_word, ids in self.index.items():
            if indexed_word.startswith(word):
                result.update(ids)
        return result

    def GetQuotes(self, search_string, tmp_result):
        """Keep the IDs in ``tmp_result`` whose document text contains
        ``search_string`` as a substring."""
        return set(
            doc_id for doc_id in tmp_result
            if search_string in self.docs[doc_id]
        )

    def GetNot(self, not_set):
        """Return every known document ID that is not in ``not_set``."""
        all_ids = set(self.docs)
        return all_ids.difference(not_set)

    def Test(self):
        """Run every query in ``tests``, print a report for each one and
        return True only if all results equal the expected sets."""
        all_ok = True
        for query, expected in self.tests.items():
            # Single-argument print(...) produces the same output on
            # Python 2 and Python 3.
            print(query)
            result = self.Parse(query)
            print('Result: %s' % result)
            print('Expect: %s' % expected)
            if expected == result:
                print('Test OK')
            else:
                all_ok = False
                print('>>>>>>>>>>>>>>>>>>>>>>Test ERROR<<<<<<<<<<<<<<<<<<<<<')
            print('')
        return all_ok
|
|
|
|
if __name__ == '__main__':
    # Run the built-in self-test when the file is executed as a script.
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the former print statements are a syntax error on Python 3.
    if ParserTest().Test():
        print('All tests OK')
    else:
        print('One or more tests FAILED')
|