0%

智能提示算法前缀树及一系列优化

智能提示
前缀树
dfs
回溯
多叉树迭代dfs
插入排序的选取

题干

智能提示的典型实现方式是:
在系统后台维护一个字典,当用户输入字符时,在字典中查找以用户当前输入的字符串为前缀的全部单词,选取其中历史使用率最高的若干单词作为候选词,建议给用户.
备注:这里约定一个字符串不能称为自己的前缀。若用户输入的字符串恰好是字典中的一个单词,则该单词不必向用户建议

输入格式

输入第一行为3个正整数n、m、k。n为字典中单词个数。m为用户查询数,即用户输入的单词个数。对于用户输入的每个字符串,程序需要返回字典中以该字符串为前缀的、历史使用频率最高的k个单词。接下来n行,表示字典信息,每行为1个整数和1个字符串,整数表示单词的历史使用频率,字符串表示单词,请注意,单词包含的每个字符为a-z的小写字母或0-9的数字,即数字也可能构成字典中的单词。字典内的单词并非按使用频率有序存放。接下来m行,表示用户的查询,每行为一个a-z的小写字母或0-9的数字组成的字符串,表示用户的查询。另外请注意,由于字典往往是在用户历史数据的基础上加工而得,所以字典中可能出现重复单词,若某个单词在字典中出现多次,则其历史使用频率以最高者为准。 (n ≤ 10000, m ≤ 20000, k ≤ 10, 每个单词长度不超过20,单词历史使用频率小于$2^{31}$)

输出格式:

对于用户输入的每个字符串,按使用频率降序输出字典中以该字符串为前缀的、历史使用频率最高的k个单词,每个占1行。若多个单词历史使用频率相同,则字典序靠前的单词排名靠前。若单词中包含数字,则字典序以ACSII码判定,即0<1<2<…<9<a<b<c<…<z。若字典中满足输出条件的单词个数大于0小于k,则有多少就输出多少个。若字典中没有满足输出条件的单词,则输出“no suggestion”。针对用户每个查询所输出的信息,用空行间隔。

输入样例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
20 3 4
1827187200 the
1595609600 to
1107331800 that
401542500 this
334039800 they
282026500 their
250991700 them
196118888 these
150877900 than
144968100 time
125563600 then
109336600 two
196120000 there
87862100 those
79292500 through
75885600 the
71578000 think
67462300 2
65648600 tx356
57087700 though
th
xxx
the

输出样例

1
2
3
4
5
6
7
8
9
10
11
the
that
this
they

no suggestion

they
their
them
there

分析1

一眼前缀树,搜索时找到用户输入的前缀perfix最后一个字母对应的节点后对其子节点进行dfs

PTA测试时间450ms

实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#include <bits/stdc++.h>
using namespace std;
#define char2pos(ch) ((ch >= '0' && ch <= '9') ? ch - '0' : ch - 'a' + 10)
#define pos2char(pos) ((pos >= 0 && pos <= 9) ? pos + '0' : pos - 10 + 'a')
class trietree {
public:
struct Node {
int perfix = 0;
int freq = 0;
Node *next[36] = {NULL};
};
void insert(char *str, int freq) {
Node *cur = &root;
for (int i = 0; i < strlen(str); i++) {
int pos = char2pos(str[i]);
if (!cur->next[pos]) {
cur->next[pos] = new Node;
}
cur = cur->next[pos];
cur->perfix++;
}
cur->freq = cur->freq > freq ? cur->freq : freq;
}
vector<pair<int, string>> query(char *perfix) { // 返回的vec不用加上perfix
Node *perfix_end_node = &root;
for (int i = 0; i < strlen(perfix); i++) {
int pos = char2pos(perfix[i]);
if (perfix_end_node->next[pos]) {
perfix_end_node = perfix_end_node->next[pos];
} else {
return {};
}
}
bool perfix_end_node_has_son = false;
{ // dfs
vector<pair<int, string>> result;
auto push = [&](char *str, int freq) {
int i = 0;
while (i < result.size() && result[i].first >= freq) {
i++;
}
result.insert(result.begin() + i, {freq, str});
};
for (int i = 0; i < 36; i++) {
if (perfix_end_node->next[i]) {
perfix_end_node_has_son = true;
Node *cur = perfix_end_node->next[i];
char path[21] = {0};
int pathend = 0;
function<void(Node *, char)> dfsRe = [&](Node *root, char ch) {
if (!root)
return;
path[pathend++] = ch;
if (root->freq) {
path[pathend] = 0;
push(path, root->freq);
}
for (int i = 0; i < 36; i++) {
dfsRe(root->next[i], pos2char(i));
}
pathend--;
};
dfsRe(cur, pos2char(i));
}
}
if (!perfix_end_node_has_son) {
return {};
}
return result;
}
}
Node root;
};
int main(int argc, char *argv[]) {
int N, M, K;
scanf("%d", &N);
scanf("%d", &M);
scanf("%d", &K);
trietree t;
char buf[21];
int freq = INT_MIN;
for (int i = 0; i < N; i++) {
scanf("%d", &freq);
scanf("%s", buf);
t.insert(buf, freq);
}
for (int i = 0; i < M; i++) {
scanf("%s", buf);
auto result = t.query(buf);
if (result.size()) {
for (int i = 0, num = K; i < result.size() && num > 0; i++, num--) {
printf("%s%s\n", buf,result[i].second.c_str());
}
} else {
printf("no suggestion\n");
}
if (i != M - 1)
printf("\n");
}
return 0;
}

分析2

分析1代码提交后发现时间复杂度太高,遂想到可以对递归剪枝

用户重复查询同一个perfix会对同一dfs重复多次
用户查询th than thana 等重复dfs了thana的a节点
于是将算法改为在读入字典数据后立即对每一个节点构造其suggestions

PTA测试时间30ms

实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#include <bits/stdc++.h>
using namespace std;
#define char2pos(ch) ((ch >= '0' && ch <= '9') ? ch - '0' : ch - 'a' + 10)
#define pos2char(pos) ((pos >= 0 && pos <= 9) ? pos + '0' : pos - 10 + 'a')
class trietree {
public:
struct Node {
int perfix = 0;
int freq = 0; // 频率,频率为0代表没有以此结束的str
Node *next[36] = {NULL};
vector<int> nextvec; // 为了避免36次循环设立的数组
void vecpush(int pos) { // nextvec数组对应的push函数(为了保证字典序)
int cur = 0;
while (cur < nextvec.size() && nextvec[cur] < pos) {
cur++;
}
nextvec.insert(nextvec.begin() + cur, pos);
}
vector<pair<string, int>> suggestions; // str,freq
};
void insert(char *str, int freq) { // 正常的insert
int len = strlen(str);
if (!len)
return;
Node *cur = &root;
for (int i = 0; i < len; i++) {
int pos = char2pos(str[i]);
if (!cur->next[pos]) {
cur->next[pos] = new Node;
cur->vecpush(pos);
}
cur = cur->next[pos];
cur->perfix++;
}
cur->freq = cur->freq > freq ? cur->freq : freq; // 同一单词出现多次
}
// void suggestion_push_end(Node *node, char *str, int freq) {
void suggestion_push(Node *node, string str, int freq) {
int i = 0;
while (i < node->suggestions.size() &&
node->suggestions[i].second >= freq) {
i++;
if (i + 1 > max_suggestion_count)
return;
}
if (i + 1 <= max_suggestion_count)
node->suggestions.insert(node->suggestions.begin() + i, {str, freq});
}
char buildpath[21];
int buildpathend = 0;
void build_suggestions(Node *root) {
if (!root) {
return;
}
// 每个循环内字典序必然是升序,循环间也是
for (auto it : root->nextvec) {
Node *son = root->next[it];
buildpath[buildpathend++] = pos2char(it);
if (son->freq) { // 此时自己的sug为空
// push自己
buildpath[buildpathend] = 0;
suggestion_push(root, buildpath, son->freq);
}
build_suggestions(son);
for (auto sonsug : son->suggestions) {
suggestion_push(root, sonsug.first, sonsug.second);
}
buildpathend--;
}
}
// 查询函数,返回以perfix开头的所有排好序的string,<int,string>中int是频率
vector<pair<string, int>> query(char *perfix, int num) {
Node *perfix_end_node = &root;
// 找到str结尾单词的节点perfix_end_node(例如the:e的节点)
for (int i = 0; i < strlen(perfix); i++) {
int pos = char2pos(perfix[i]);
if (perfix_end_node->next[pos]) {
perfix_end_node = perfix_end_node->next[pos];
} else {
return {};
}
}
return perfix_end_node->suggestions;
}
Node root;
int max_suggestion_count = 0;
};
int main(int argc, char *argv[]) {
int N, M, K;
scanf("%d", &N);
scanf("%d", &M);
scanf("%d", &K);
trietree t;
t.max_suggestion_count = K;
char buf[21];
int freq;
for (int i = 0; i < N; i++) {
scanf("%d", &freq);
scanf("%s", buf);
t.insert(buf, freq);
}
t.build_suggestions(&t.root);
for (int i = 0; i < M; i++) {
scanf("%s", buf);
int num = K;
auto result = t.query(buf, K);
if (result.size()) {
for (auto &&it : result) {
if (!num)
break;
printf("%s\n",it.first.c_str());
num--;
}
} else {
printf("no suggestion\n");
}
if (i != M - 1)
printf("\n");
}
return 0;
}

分析3(from 2)

若不是投入使用,仅仅为了通过题目,可以进行以下优化
在用户输入一系列perfix时有可能某些节点从未被访问
因此可以在输入一个perfix后再对该perfix最后一个字母的节点进行dfs建立suggestions

实现:无

分析4 (from ?)

其实大可不必dfs过来dfs过去的,还回溯个毛线啊,直接在insert的时候把沿途路径上每个节点的suggestions更新了就行了

PTA测试时间30ms

实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include <bits/stdc++.h>
using namespace std;
#define char2pos(ch) ((ch >= '0' && ch <= '9') ? ch - '0' : ch - 'a' + 10)
#define pos2char(pos) ((pos >= 0 && pos <= 9) ? pos + '0' : pos - 10 + 'a')
class trietree {
public:
struct Node {
int perfix = 0;
int freq = 0; // 频率,频率为0代表没有以此结束的str
Node *next[36] = {NULL};
vector<int> nextvec; // 为了避免36次循环设立的数组
void vecpush(int pos) { // nextvec数组对应的push函数(为了保证字典序)
int cur = 0;
while (cur < nextvec.size() && nextvec[cur] < pos) {
cur++;
}
nextvec.insert(nextvec.begin() + cur, pos);
}
vector<pair<string, int>> suggestions; // str,freq
};
void insert(char *str, int freq) {
int len = strlen(str);
if (!len)
return;
Node *cur = &root;
for (int i = 0; i < len; i++) {
int pos = char2pos(str[i]);
if (!cur->next[pos]) {
cur->next[pos] = new Node;
cur->vecpush(pos);
}
cur = cur->next[pos];
if (i != len - 1) {
suggestion_push(cur, str, freq);
}
cur->perfix++;
}
cur->freq = cur->freq > freq ? cur->freq : freq; // 同一单词出现多次
}
void suggestion_push(Node *node, string str,int freq) {
int i = 0;
auto curvec = &node->suggestions;
int inserted = false;
for (auto it = curvec->begin(); it != curvec->end(); it++) {// 同一单词出现多次
if ((*it).first == str) {
(*it).second = (*it).second > freq ? (*it).second : freq;
inserted = true;
}
}
if (!inserted) {
curvec->push_back({str, freq});
}
sort(curvec->begin(), curvec->end(),
[](pair<string, int> &a, pair<string, int> &b) {
if (a.second > b.second)
return true;
if (a.second < b.second)
return false;
return a.first < b.first;
});//此时sort的性能应该不咋地,但懒得优化了
if(curvec->size()>max_suggestion_count)
curvec->erase(curvec->begin()+max_suggestion_count,curvec->end());
}
// 查询函数,返回以perfix开头的所有排好序的string,<int,string>中int是频率
vector<pair<string, int>> query(char *perfix, int num) {
Node *perfix_end_node = &root;
// 找到str结尾单词的节点perfix_end_node(例如the:e的节点)
for (int i = 0; i < strlen(perfix); i++) {
int pos = char2pos(perfix[i]);
if (perfix_end_node->next[pos]) {
perfix_end_node = perfix_end_node->next[pos];
} else {
return {};
}
}
return perfix_end_node->suggestions;
}
Node root;
int max_suggestion_count = 0;
};
int main(int argc, char *argv[]) {
int N, M, K;
scanf("%d", &N);
scanf("%d", &M);
scanf("%d", &K);
trietree t;
t.max_suggestion_count = K;
char buf[21];
int freq;
for (int i = 0; i < N; i++) {
scanf("%d", &freq);
scanf("%s", buf);
t.insert(buf, freq);
}
for (int i = 0; i < M; i++) {
scanf("%s", buf);
auto result = t.query(buf, K);
if (result.size()) {
for (int i = 0, num = K; i < result.size() && num > 0; i++, num--) {
printf("%s\n", result[i].first.c_str());
}
} else {
printf("no suggestion\n");
}
if (i != M - 1)
printf("\n");
}
return 0;
}

CSDN java实现

扩展 当需要处理的字符不仅仅是什么26个英文字母的时候,甚至出现中文等语言的时候如何处理?

????怎么实现啊?????
编码???


额外收获

  • cout和printf性能对比
  • 多叉树(一般树)的dfs及其递归转迭代思路: whileleftin;while(stack)(popandpushnext,whileleftin)
  • 递归转栈大概率不会减少时间,申请系统的用户栈和自己模拟栈性能似乎差不多?(至少不会带来明显的性能提升)[待证]

其他

  • int perfix = 0;在本题是没用的
  • nextvec纯纯负优化,不如整个map<int,Node*>,也能自动排序,也能避免36次循环,不亦乐乎?
  • 另外附上分析1栈版本代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#include <bits/stdc++.h>
using namespace std;
#define char2pos(ch) ((ch >= '0' && ch <= '9') ? ch - '0' : ch - 'a' + 10)
#define pos2char(pos) ((pos >= 0 && pos <= 9) ? pos + '0' : pos - 10 + 'a')
class trietree {
public:
struct Node {
int perfix = 0;
int freq = 0;//频率,频率为0代表没有以此结束的str
Node *next[36] = {NULL};
vector<int> nextvec;//为了避免36次循环设立的数组
void vecpush(int pos) {//nextvec数组对应的push函数(为了保证字典序)
int cur = 0;
while (cur < nextvec.size() && nextvec[cur] < pos) {
cur++;
}
nextvec.insert(nextvec.begin() + cur, pos);
}
};
void insert(char *str, int freq) {//正常的insert
int len = strlen(str);
if (!len)
return;
Node *cur = &root;
for (int i = 0; i < len; i++) {
int pos = char2pos(str[i]);
if (!cur->next[pos]) {
cur->next[pos] = new Node;
cur->vecpush(pos);
}
cur = cur->next[pos];
cur->perfix++;
}
cur->freq = cur->freq > freq ? cur->freq : freq; // 同一单词出现多次
}

vector<pair<int, string>> query(char *perfix,int num) {//查询函数,返回以perfix开头的所有排好序的string,<int,string>中int是频率
Node *perfix_end_node = &root;
for (int i = 0; i < strlen(perfix); i++) {//找到str结尾单词的节点perfix_end_node(例如the:e的节点)
int pos = char2pos(perfix[i]);
if (perfix_end_node->next[pos]) {
perfix_end_node = perfix_end_node->next[pos];
} else {
return {};
}
}
if (!perfix_end_node->nextvec.size()) {
return {};
}
{ // dfs
vector<pair<int, string>> result;
auto push = [&](char *str, int freq) {//result(函数返回结果)的push函数,保证vector顺序的
int i = 0;
while (i < result.size() && result[i].first >= freq) {
i++;
}if(i+1<=num)
result.insert(result.begin() + i, {freq, str});
};

function<void(Node *)> dfs = [&](Node *root) {//dfs
if (!root)
return;
char path[21] = {0};
int pathend = 0;
stack<pair<Node *, int>> stack; // pos是nextvec的pos
while (root) {//循环(root=root->left)push进去root及其第一个子树
stack.push({root, 0});
if (root->nextvec.size()) {

path[pathend++] = pos2char(root->nextvec[0]);
root = root->next[root->nextvec[0]];
if (root->freq) {
path[pathend] = 0;
push(path, root->freq);
}
} else
break;
}
while (stack.size()) {
auto [top, pos] = stack.top();
stack.pop();
pathend--;
if (!stack.size()) {
break;
} else {
if (pos + 1 <= stack.top().first->nextvec.size() - 1) {
auto next =
stack.top().first->next[stack.top().first->nextvec[pos + 1]];

path[pathend++] = pos2char(stack.top().first->nextvec[pos + 1]);
if (next->freq) {
path[pathend] = 0;
push(path, next->freq);
}
stack.push({next, pos + 1});
while (next->nextvec.size()) {
int pos = next->nextvec[0];
auto node = next->next[pos];
stack.push({node, 0});

path[pathend++] = pos2char(pos);
if (node->freq) {
path[pathend] = 0;
push(path, node->freq);
}
next = node;
}
}
}
}
};
for (auto it : perfix_end_node->nextvec) {//对perfix_end_node每个son进行dfs
Node tmp = {
.perfix = 0,
.freq = 0,
.next = {NULL},
.nextvec = {it},
};
tmp.next[it] = perfix_end_node->next[it];
dfs(&tmp);
}

return result;
}
}
Node root;
};
int main(int argc, char *argv[]) {
int N, M, K;
scanf("%d", &N);
scanf("%d", &M);
scanf("%d", &K);
trietree t;

char buf[21];
int freq = INT_MIN;
for (int i = 0; i < N; i++) {
scanf("%d", &freq);
scanf("%s", buf);
t.insert(buf, freq);
}
for (int i = 0; i < M; i++) {
scanf("%s", buf);
int num = K;
auto result = t.query(buf, K);
if (result.size()) {
for (auto &&it : result) {
if (!num)
break;
printf("%s%s\n",buf,it.second.c_str());
num--;
}
} else {
printf("no suggestion\n");
}
if (i != M - 1)
printf("\n");
}
return 0;
}