mirror of https://github.com/chriskiehl/Gooey.git
Chris
4 years ago
7 changed files with 254 additions and 44 deletions
1    gooey/__init__.py
0    gooey/gui/components/filtering/__init__.py
118  gooey/gui/components/filtering/prefix_filter.py
52   gooey/gui/components/widgets/dropdown_filterable.py
31   gooey/tests/test_filterable_dropdown.py
92   gooey/tests/test_filtering.py
4    gooey/util/functional.py
gooey/gui/components/filtering/prefix_filter.py
@@ -0,0 +1,118 @@
import re

import pygtrie as trie
from functools import reduce

__all__ = ('PrefixTokenizers', 'PrefixSearch')


class PrefixTokenizers:
    # This string here is just an arbitrary long string so that
    # re.split finds no matches and returns the entire phrase
    ENTIRE_PHRASE = '::gooey/tokenization/entire-phrase'
    # \s == any whitespace character
    WORDS = r'\s'

    @classmethod
    def REGEX(cls, expression):
        return expression


class SearchOptions:
    def __init__(self,
                 choice_tokenizer=PrefixTokenizers.ENTIRE_PHRASE,
                 input_tokenizer=PrefixTokenizers.ENTIRE_PHRASE,
                 ignore_case=True,
                 operator='AND',
                 index_suffix=False,
                 **kwargs):
        self.choice_tokenizer = choice_tokenizer
        self.input_tokenizer = input_tokenizer
        self.ignore_case = ignore_case
        self.operator = operator
        self.index_suffix = index_suffix


class PrefixSearch(object):
    """
    A trie backed index for quickly finding substrings
    in a list of options.
    """

    def __init__(self, choices, options={}, *args, **kwargs):
        self.choices = sorted(filter(None, choices))
        self.options: SearchOptions = SearchOptions(**options)
        self.searchtree = self.buildSearchTrie(choices)

    def updateChoices(self, choices):
        self.choices = sorted(filter(None, choices))
        # rebuild the index so the new choices are immediately searchable
        self.searchtree = self.buildSearchTrie(choices)

    def findMatches(self, token):
        if not token:
            return sorted(self.choices)
        tokens = self.tokenizeInput(token)
        matches = [set(flatten(self._vals(self.searchtree, prefix=t))) for t in tokens]
        op = intersection if self.options.operator == 'AND' else union
        return sorted(reduce(op, matches))

    def tokenizeInput(self, token):
        """
        Cleans and tokenizes the user's input.

        Empty characters and spaces are trimmed to prevent
        matching all paths in the index.
        """
        return list(filter(None, re.split(self.options.input_tokenizer, self.clean(token))))

    def tokenizeChoice(self, choice):
        """
        Splits the `choice` into a series of tokens based on
        the user's criteria.

        If suffix indexing is enabled, the individual tokens
        are further broken down and indexed by their suffix offsets. e.g.

            'Banana', 'anana', 'nana', 'ana'
        """
        choice_ = self.clean(choice)
        tokens = re.split(self.options.choice_tokenizer, choice_)
        if self.options.index_suffix:
            return [token[i:]
                    for token in tokens
                    for i in range(len(token) - 2)]
        else:
            return tokens

    def clean(self, text):
        return text.lower() if self.options.ignore_case else text

    def buildSearchTrie(self, choices):
        searchtrie = trie.Trie()
        for choice in choices:
            for token in self.tokenizeChoice(choice):
                if not searchtrie.has_key(token):
                    searchtrie[token] = []
                searchtrie[token].append(choice)
        return searchtrie

    def _vals(self, searchtrie, **kwargs):
        try:
            return searchtrie.values(**kwargs)
        except KeyError:
            return []


def intersection(a, b):
    return a.intersection(b)


def union(a, b):
    return a.union(b)


def flatten(xs):
    return [item for x in xs for item in x]
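
For context, here is a minimal usage sketch of the new PrefixSearch class (illustrative only, not part of the diff; it assumes pygtrie is installed, the import path shown in the changed-files list, and example place names that simply mirror the test fixtures below):

    from gooey.gui.components.filtering.prefix_filter import PrefixTokenizers, PrefixSearch

    # Index each choice word by word and also by word suffixes,
    # so a query can land in the middle of a word.
    search = PrefixSearch(
        ['Japan Kyoto', 'Japan Tokyo', 'Albania Tirana'],
        {'choice_tokenizer': PrefixTokenizers.WORDS,
         'input_tokenizer': PrefixTokenizers.WORDS,
         'index_suffix': True})

    print(search.findMatches('kyo'))     # ['Japan Kyoto', 'Japan Tokyo']
    print(search.findMatches('alb ti'))  # ['Albania Tirana'] (default AND across input tokens)
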
gooey/tests/test_filtering.py
@@ -0,0 +1,92 @@
import unittest

from gooey import PrefixTokenizers
from gui.components.filtering.prefix_filter import SearchOptions, PrefixSearch
from collections import namedtuple

TestData = namedtuple('TestData', [
    'options',
    'input_string',
    'expected_results',
])

Places = namedtuple('Places', [
    'kabul',
    'tirana',
    'kyoto',
    'tokyo'
])

class TestPrefixFilter(unittest.TestCase):

    def setUp(self):
        self.testdata = Places(
            'Afghanistan Kabul',
            'Albania Tirana',
            'Japan Kyoto',
            'Japan Tokyo'
        )

    def test_prefix_searching(self):
        p = self.testdata
        cases = [
            TestData({'ignore_case': True}, 'a', [p.kabul, p.tirana]),
            TestData({'ignore_case': True}, 'A', [p.kabul, p.tirana]),
            TestData({'ignore_case': False}, 'a', []),
            TestData({'ignore_case': False}, 'A', [p.kabul, p.tirana]),

            # when using the phrase tokenizer, the search input must
            # match starting from the beginning. So we find Afghanistan
            TestData({'choice_tokenizer': PrefixTokenizers.ENTIRE_PHRASE}, 'Afghan', [p.kabul]),
            # but we cannot look up Kyoto because the phrase begins with "Japan"
            TestData({'choice_tokenizer': PrefixTokenizers.ENTIRE_PHRASE}, 'Kyoto', []),
            # So if we start with "Japan K" it'll be returned
            TestData({'choice_tokenizer': PrefixTokenizers.ENTIRE_PHRASE}, 'Japan K', [p.kyoto]),

            # the word tokenizer splits on all whitespace and indexes
            # each choice once for each UNIQUE word,
            # so passing in 'a' will match "Af" and "Al" as usual
            TestData({'choice_tokenizer': PrefixTokenizers.WORDS}, 'a', [p.kabul, p.tirana]),
            # but now we can also find Kyoto without prefixing "Japan" as we'd
            # need to do with the phrase tokenizer
            TestData({'choice_tokenizer': PrefixTokenizers.WORDS}, 'kyo', [p.kyoto]),

            # if we tokenize the input, we perform one search against the index per token.
            # The default operator is AND, which means all the words in your search
            # input must match the choice for it to count as a hit.
            # In this example, we index the choices under ENTIRE_PHRASE, but set the input
            # tokenizer to WORDS. Our input 'Japan K' gets tokenized to ['Japan', 'K'].
            # There is no phrase which starts with both "Japan" and "K", so we get no
            # matches returned
            TestData({'choice_tokenizer': PrefixTokenizers.ENTIRE_PHRASE,
                      'input_tokenizer': PrefixTokenizers.WORDS}, 'Japan K', []),
            # tokenizing the choices by WORDS means we can now filter on both words
            TestData({'choice_tokenizer': PrefixTokenizers.WORDS,
                      'input_tokenizer': PrefixTokenizers.WORDS}, 'Jap K', [p.kyoto]),
            # the default AND behavior can be swapped to OR to facilitate matching across
            # different records in the index.
            TestData({'choice_tokenizer': PrefixTokenizers.WORDS,
                      'input_tokenizer': PrefixTokenizers.WORDS,
                      'operator': 'OR'}, 'Kyo Tok', [p.kyoto, p.tokyo]),

            # Turning on suffix indexing allows matching anywhere within a word.
            # Now 'kyo' will match both the beginning of 'Kyoto' and the substring in 'ToKYO'
            # (see the short suffix-tokenization sketch after this file).
            TestData({'choice_tokenizer': PrefixTokenizers.WORDS,
                      'input_tokenizer': PrefixTokenizers.WORDS,
                      'index_suffix': True}, 'kyo ', [p.kyoto, p.tokyo]),

            TestData({'choice_tokenizer': PrefixTokenizers.WORDS,
                      'input_tokenizer': PrefixTokenizers.WORDS,
                      'index_suffix': True}, 'j kyo ', [p.kyoto, p.tokyo]),
        ]

        for case in cases:
            with self.subTest(case):
                searcher = PrefixSearch(self.testdata, case.options)
                result = searcher.findMatches(case.input_string)
                self.assertEqual(result, case.expected_results)
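
As a rough standalone illustration of what suffix indexing stores (this snippet mirrors the list comprehension in tokenizeChoice and is not part of the diff):

    token = 'tokyo'
    suffixes = [token[i:] for i in range(len(token) - 2)]
    print(suffixes)  # ['tokyo', 'okyo', 'kyo'] -- a prefix query for 'kyo' therefore reaches 'Tokyo'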