0x20字符串-(4)-Trie - Be Grinder,for Better

Trie(字典树)

Trie是一种用来高效存储和查找字符串集合的数据结构。

字母的类型不会很多：要么全是小写字母，要么全是大写字母，要么全是数字，要么全是01。

插入(Insert)

把一个给定的字符串s插入到Trie中

查询(query)

查询Trie中是否存在给定的字符串s，返回数量

前缀(start_with)

查询Trie中是否存在字符串以给定的字符串s为前缀，返回数量

静态实现

需要根据实际问题中字符出现的种类数目，修改 $ALPHABET\_SIZE$ ，一般都取26。

#include <iostream>

using namespace std;

const int N = 1e5 + 7;
const int M = N * L;   // N是字符串个数, L是字符串平均长度
const int ALPHABETS_SIZE = 26;

struct Trie {
private:
    int son[M][ALPHABETS_SIZE], idx = 0;
    int cnt[N];
public:
    void insert(const string& s) {
        int p = 0;
        for (char c : s) {
            int t = c - 'a';
            if (!son[p][t]) son[p][t] = ++idx;
            p = son[p][t];
        }
        cnt[p]++;// how mant words
    }
    int query(const string& s) {
        int p = 0;
        for (char c : s) {
            int t = c - 'a';
            if(!son[p][t]) return 0;
            p = son[p][t];
        }
        return cnt[p];
    }
    int start_with(const string &s) {
        int res = 0;
        int p = 0;
        for (char c : s) {
            int t = c - 'a';
            if (!son[p][t]) break;
            p = son[p][t];
            res += cnt[p];
        }
        return res;
    }
};

复杂度分析

假设当前需要插入或查询的str[]的有效长度为 $L$ 。

时间复杂度： $insert()$ ： $O(L)$ ； $query()$ ： $O(L)$
空间复杂度： $O(M * ALPHABET\_SIZE)$ 。

动态实现

基于vector实现

需要根据实际问题中字符出现的种类数目，修改 $ALPHABET\_SIZE$ ，一般都取26。

#include <string>
#include <vector>

using namespace std;

const int ALPHABET_SIZE = 26;

struct Trie {
private:
    vector<Trie*> son;
    int cnt;
public:
    Trie() : cnt(0), son(ALPHABET_SIZE, nullptr) {}
    ~Trie() {
        for (auto &s : son) delete(s);
    }
    
    void insert(const string& s) {
        Trie* p = this;
        for (char c : s) {
            int t = c - 'a';
            if (!p->son[t]) p->son[t] = new Trie();
            p = p->son[t];
        }
        p->cnt++;
    }
    
    int query(const string & s) {
        Trie*p = this;
        for (char c : s) {
            int t = c - 'a';
            if (!p->son[t]) return 0;
            p = p->son[t];
        }
        return p->cnt;
    }
    
    int start_with(const string& s) {
        int res = 0;
        Trie* p = this;
        for (char c : s) {
            int t = c - 'a';
            if (!p->son[t]) break;
            p = p->son[t];
            res += p->cnt;
        }
        return res;
    }
};

基于unordered_map实现

能够动态调整字符种类的数目，无需把字符映射成数字，代码更简洁。

#include <string>
#include <unordered_map>

using namespace std;

struct Trie {
private:
    int cnt;
    unordered_map<char, Trie*> son;
public:
    Trie() : cnt(0) {}
    ~Trie() {
        for (auto &s : son) delete(s.second);
    }
    
    void insert(const string& s) {
        Trie* p = this;
        for (char c : s) {
            if (!p->son[c]) p->son[c] = new Trie();
            p = p->son[c];
        }
        p->cnt++;
    }
    
    int query(const string& s) {
        Trie*p = this;
        for (char c : s) {
            if (!p->son[c]) return 0;
            p = p->son[c];
        }
        return p->cnt;
    }
    
    int start_with(const string& s) {
        int res = 0; 
        Trie* p = this;
        for (char c : s) {
            if (!p->son[c]) break;
            p = p->son[c];
            res += p->cnt;
        }    
        return res;
    }
};

算法竞赛，时间复杂度是王道，因此个人习惯使用vector的版本：

	基于vector实现	基于unordered_map实现
时间	较快，比unordered_map版本接近快1倍	较慢，要处理hash冲突
空间	$ALPHABET\_SIZE$ 固定，会产生更多的内存碎片	内存碎片较少

复杂度分析

Trie的复杂度与字符串的个数和长度有关。假设Trie中包含 $N$ 个字符串，且平均字符串长度为 $L$ 。

时间复杂度： $insert()$ ： $O(L)$ ； $query()$ ： $O(L)$ ； $start\_with()$ ： $O(L)$ ，构建Trie为 $O(n*L)$
空间复杂度： $O(n*L)$ 。

练习eg：

Acwing-142.前缀统计

应用：

01-Trie(01字典树)

真神，不会被卡TLE;

const int N = 1e5 + 7, M = N * 31;
const int ALPHABETS_SIZE = 2;

struct Trie {
private:
    int son[M][ALPHABETS_SIZE], idx;
public:    
    void insert(int x) {
        int p = 0;
        for (int i = 30; i >= 0; i --) {
            int t = x >> i & 1;
            if (!son[p][t]) {
                son[p][t] = ++idx;
            }
            p = son[p][t];
        }
    }
    
    int query(int x) {
        int res = 0;
        int p = 0;
        for (int i = 30; i >= 0; i --) {
            int t = x >> i & 1;
            if (son[p][!t]) {
                p = son[p][!t];
                res = (res << 1) + !t;
            }
            else {
                p = son[p][t];
                res = (res << 1) + t;
            }
        }
        return res;
    }
};

这个应用应该整理一下：

Leetcode-792.匹配子序列的单词数

// query_dfs
// 枚举字典树每一层的可行路径 去匹配s, 而非去对s进行dfs
void query_dfs(Trie* p, string& s,int pos){
    // 当前位置存在结尾
	if (p->cnt) {
        ans += p->cnt;
        p->cnt = 0; // 置0十分重要，避免重复计算
    }
    for (int i = 0; i < 26; i ++) {
        // 存在子节点
        if (p->son[i]) {
            // 可以看作pos ~ next_pos之间的全失效
            int next_pos = s.find_first_of('a' + i, pos);
            if (next_pos != string::npos) {
                query_dfs(p->son[i], s, next_pos + 1);
            }
        }
    }
}

数据结构与算法

Algorithm Template C++

本博客所有文章均采用 CC BY-NC-SA 4.0 协议，禁止商用，转载请注明出处！

0x20字符串-(5)-AC自动机上一篇

0x20字符串-(3)-Manacher 下一篇