由于数据将被多次搜索,我首先解析字符串以便多次搜索 = 参见by_date
.
我使用二进制搜索来查找特定日期的第一个字符串,然后迭代增加次数,在filtered
函数变量中收集适当的字符串strings_between
。
# -*- coding: utf-8 -*-
"""
https://stackoverflow.com/questions/67562250/fastest-string-filtering-algorithm
Created on Tue May 18 09:20:11 2021
@author: Paddy3118
"""
strings = """\
John.Howard.12-11-2020 13:14
Diane.Barry.29-07-2020 20:50
Joseph.Ferns.08-05-2020 08:02
Joseph.Ferns.02-03-2020 05:09
Josephine.Fernie.01-01-2020 07:20
Alex.Alexander.06-06-2020 10:10
Howard.Jennings.07-07-2020 13:17
Hannah.Johnson.08-08-2020 00:49
Josephine.Fernie.08-08-2020 07:20
Alex.Alexander.08-08-2020 10:10
Howard.Jennings.08-08-2020 13:17
Hannah.Johnson.08-08-2020 09:49\
"""
## First parse the date information once for all future range calcs
def to_mins(hr_mn='00:00'):
hr, mn = hr_mn.split(':')
return int(hr) * 60 + int(mn)
by_date = dict() # Keys are individual days, values are time-sorted
for s in strings.split('\n'):
name_day, time = s.strip().split()
name, day = name_day.rsplit('.', 1)
minutes = to_mins(time)
if day not in by_date:
by_date[day] = [(minutes, s)]
else:
by_date[day].append((minutes, s))
for day_info in by_date.values():
day_info.sort()
## Now rely on dict search for day then binary +linear search within day.
def _bisect_left(a, x):
"""Return the index where to insert item x in list a, assuming a is sorted.
The return value i is such that all e in a[:i] have e < x, and all e in
a[i:] have e >= x. So if x already appears in the list, a.insert(x) will
insert just before the leftmost x already there.
'a' is a list of tuples whose first item is assumed sorted and searched apon.
"""
lo, hi = 0, len(a)
while lo < hi:
mid = (lo+hi)//2
# Use __lt__ to match the logic in list.sort() and in heapq
if a[mid][0] < x: lo = mid+1
else: hi = mid
return lo
def strings_between(day="01-01-2020", start="00:00", finish="23:59"):
global by_date
if day not in by_date:
return []
day_data = by_date[day]
start, finish = to_mins(start), to_mins(finish)
from_index = _bisect_left(day_data, start)
filtered = []
for time, s in day_data[from_index:]:
if time <= finish:
filtered.append(s)
else:
break
return filtered
## Example data
assert by_date == {
'12-11-2020': [(794, 'John.Howard.12-11-2020 13:14')],
'29-07-2020': [(1250, 'Diane.Barry.29-07-2020 20:50')],
'08-05-2020': [(482, 'Joseph.Ferns.08-05-2020 08:02')],
'02-03-2020': [(309, 'Joseph.Ferns.02-03-2020 05:09')],
'01-01-2020': [(440, 'Josephine.Fernie.01-01-2020 07:20')],
'06-06-2020': [(610, 'Alex.Alexander.06-06-2020 10:10')],
'07-07-2020': [(797, 'Howard.Jennings.07-07-2020 13:17')],
'08-08-2020': [(49, 'Hannah.Johnson.08-08-2020 00:49'),
(440, 'Josephine.Fernie.08-08-2020 07:20'),
(589, 'Hannah.Johnson.08-08-2020 09:49'),
(610, 'Alex.Alexander.08-08-2020 10:10'),
(797, 'Howard.Jennings.08-08-2020 13:17')]}
## Example queries from command line
"""
In [7]: strings_between('08-08-2020')
Out[7]:
['Hannah.Johnson.08-08-2020 00:49',
'Josephine.Fernie.08-08-2020 07:20',
'Hannah.Johnson.08-08-2020 09:49',
'Alex.Alexander.08-08-2020 10:10',
'Howard.Jennings.08-08-2020 13:17']
In [8]: strings_between('08-08-2020', '09:30', '24:00')
Out[8]:
['Hannah.Johnson.08-08-2020 09:49',
'Alex.Alexander.08-08-2020 10:10',
'Howard.Jennings.08-08-2020 13:17']
In [9]: strings_between('08-08-2020', '09:49', '10:10')
Out[9]: ['Hannah.Johnson.08-08-2020 09:49', 'Alex.Alexander.08-08-2020 10:10']
In [10]:
"""