WRY

Where Are You?
You are on the brave land,
To experience, to remember...

0%

【算法】层次时间文件夹的范围查找

问题描述

最近遇到了查找数据的问题,简要描述如下:

数据被按照产生的时间,存储在对应的文件夹中,例如2022-11-12 22:33的数据被存放在2022/11/12/22/33的文件夹下,现在需要查找一个指定时间范围的数据,所以需要找出所有满足该时间范围的文件夹,例如查找2022-11-12 22:30-2022-11-12 22:33范围(左闭右开)的文件夹,那么结果如下:

2022/11/12/22/30, 2022/11/12/22/31, 2022/11/12/22/32

此外文件系统采用了远程文件系统,所以文件系统的操作都有不少的时间消耗,并且你只有两种API可以使用:

  • 列出一个文件夹下的所有子文件夹
  • 判断一个文件夹是否存在

请问如何快速找出指定范围所包含的数据文件夹?

More: 在一些场景下,如果某段时间没有产生数据,那么不会创建对应的文件夹,请确保大时间范围时的算法高效。

暴力

枚举时间范围内的所有文件夹,并判断文件夹是否存在

1
2
3
4
5
6
7
8
9
def find_folder_according_time_range_v1(st_time, end_time, root_folder=None):
root_folder = "./test_folder" if root_folder is None else root_folder
lt = []
while st_time < end_time:
folder_name = f"{root_folder}/{st_time.year}/{st_time.month}/{st_time.day}/{st_time.hour}/{st_time.minute}"
if os.path.exists(folder_name):
lt.append(folder_name)
st_time = st_time + timedelta(minutes=1)
return lt

递归判断

一层一层的判断文件夹下是否存在这个时间区间的数据,若存在,继续向下一层查找,若不存在放弃。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def find_folder_according_time_range_v2(st_time, end_time, root_folder=None):
"""
left close and right open
"""
root_folder = "./test_folder" if root_folder is None else root_folder
lt = []

def get_by_level(t, level):
if level == 0:
return t.year
elif level == 1:
return t.month
elif level == 2:
return t.day
elif level == 3:
return t.hour
elif level == 4:
return t.minute
else:
assert False, "level is invalid"

def find(dir, level=0, eq_left=True, eq_right=True):
if level == 5:
lt.append(dir)
return
for sub_dir in os.listdir(dir):
if not eq_left and not eq_right:
# 两边都不贴边,保存所有
find(f"{dir}/{sub_dir}", level + 1, False, False)
else:
st_num = get_by_level(st_time, level)
end_num = get_by_level(end_time, level)
dir_num = int(sub_dir)
if (eq_left and dir_num < st_num):
# 左侧贴边且当前值还在左侧边界的左边,所以不考虑
continue
elif (eq_right and dir_num > end_num
and level < 4) or (eq_right and dir_num >= end_num
and level == 4):
# 右侧贴边且当前值还在右侧边界的右边,所以不考虑
continue
else:
tel = eq_left and (st_num == dir_num)
ter = eq_right and (dir_num == end_num)
find(f"{dir}/{sub_dir}", level + 1, tel, ter)

find(root_folder)
return lt

附录

完整相关代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from datetime import datetime, timedelta
import os


def create_folder(st_time, end_time, root_folder=None):
root_folder = "./test_folder" if root_folder is None else root_folder

while st_time < end_time:
folder_name = f"{root_folder}/{st_time.year}/{st_time.month}/{st_time.day}/{st_time.hour}/{st_time.minute}"
os.makedirs(folder_name)
st_time += timedelta(minutes=1)


def find_folder_acc_time_range_v1(st_time, end_time, root_folder=None):
root_folder = "./test_folder" if root_folder is None else root_folder
lt = []
while st_time < end_time:
folder_name = f"{root_folder}/{st_time.year}/{st_time.month}/{st_time.day}/{st_time.hour}/{st_time.minute}"
if os.path.exists(folder_name):
lt.append(folder_name)
st_time = st_time + timedelta(minutes=1)
return lt


def find_folder_acc_time_range_v2(st_time, end_time, root_folder=None):
"""
left close and right open
"""
root_folder = "./test_folder" if root_folder is None else root_folder
lt = []

def get_by_level(t, level):
if level == 0:
return t.year
elif level == 1:
return t.month
elif level == 2:
return t.day
elif level == 3:
return t.hour
elif level == 4:
return t.minute
else:
assert False, "level is invalid"

def find(dir, level=0, eq_left=True, eq_right=True):
if level == 5:
lt.append(dir)
return
for sub_dir in os.listdir(dir):
if not eq_left and not eq_right:
# save all
find(f"{dir}/{sub_dir}", level + 1, False, False)
else:
st_num = get_by_level(st_time, level)
end_num = get_by_level(end_time, level)
dir_num = int(sub_dir)
if (eq_left and dir_num < st_num):
continue
elif (eq_right and dir_num > end_num
and level < 4) or (eq_right and dir_num >= end_num
and level == 4):
continue
else:
tel = eq_left and (st_num == dir_num)
ter = eq_right and (dir_num == end_num)
find(f"{dir}/{sub_dir}", level + 1, tel, ter)

find(root_folder)
return lt


if __name__ == "__main__":
st = datetime.strptime("2022-11-12 20:46", "%Y-%m-%d %H:%M")
ed = datetime.strptime("2022-12-13 21:59", "%Y-%m-%d %H:%M")
create_folder(st, ed)
st = datetime.strptime("2022-11-12 11:58", "%Y-%m-%d %H:%M")
ed = datetime.strptime("2022-12-12 11:59", "%Y-%m-%d %H:%M")
res_v1 = find_folder_acc_time_range_v1(st, ed)
res_v2 = find_folder_acc_time_range_v2(st, ed)
print(set(res_v1) == set(res_v2))
print("res_v1 - res_v2", set(res_v1) - set(res_v2))
print("res_v2 - res_v1", set(res_v2) - set(res_v1))