字符串池化解析：以Yosys IdString为例

13.02.2026 00:00 c++ eda

问题背景

在C++中，两个内容相同的字符串std::string是不同的对象：

1std::string s1 = "clk";
2std::string s2 = "clk";
3s1 == s2;  // true，但O(n)逐字符比较
4&s1[0] != &s2[0];  // 不同内存地址

比较开销： O(n)逐字符比较
内存冗余：相同字符串多份拷贝
哈希表性能：重复计算哈希值
缓存不友好：分散存储，cache miss

Yosys IdString的解决方案

IdString本质上是一个int

1struct IdString {
2    int index_;  // 4字节ID代表字符串的身份
3    
4    // O(1)比较
5    bool operator==(const IdString &rhs) const { 
6        return index_ == rhs.index_; 
7    }
8};

全局存储架构

 1
 2// 实际字符串存储
 3static std::vector<char*> global_id_storage_;
 4// 索引 -> 字符串指针
 5// [0] -> "" 
 6// [1] -> "$auto$1"
 7// [2] -> "\\clk" 
 8// [3] -> "$and"
 9
10// 字符串到索引映射
11static dict<char*, int> global_id_index_;
12// "$auto$1" -> 1
13// "\\clk"   -> 2
14
15//引用计数
16static std::vector<int> global_refcount_storage_;
17// [1] -> 5   (5个地方引用"$auto$1")
18// [2] -> 100 (100个地方引用"clk")
19
20//空闲索引复用
21static std::vector<int> global_free_idx_list_;
22// [42, 57, 83]  // 这些索引可以复用

String Interning

 1static int get_reference(const char *p)
 2{
 3    log_assert(destruct_guard_ok);
 4
 5    if (!p[0])
 6        return 0;
 7
 8    // 查找是否已存在
 9    auto it = global_id_index_.find((char*)p);
10    if (it != global_id_index_.end()) {
11        // 已存在：增加引用计数，返回索引
12        global_refcount_storage_.at(it->second)++;
13        return it->second;
14    }
15
16    log_assert(p[0] == '$' || p[0] == '\\');
17    log_assert(p[1] != 0);
18    for (const char *c = p; *c; c++)
19        if ((unsigned)*c <= (unsigned)' ')
20            log_error("Found control character or space (0x%02x) in string '%s'"
21                      " which is not allowed in RTLIL identifiers\n", *c, p);
22
23    //分配新索引
24    int idx;
25    if (global_free_idx_list_.empty()) {
26        // 无空闲索引：扩容
27        idx = global_id_storage_.size();
28        global_id_storage_.push_back(nullptr);
29        global_refcount_storage_.push_back(0);
30    } else {
31        // 复用空闲索引
32        idx = global_free_idx_list_.back();
33        global_free_idx_list_.pop_back();
34    }
35
36    // 存储字符串
37    global_id_storage_.at(idx) = strdup(p);
38    global_id_index_[global_id_storage_.at(idx)] = idx;
39    global_refcount_storage_.at(idx) = 1;
40
41    return idx;
42}

引用计数生命周期管理

 1// 获取引用
 2static inline int get_reference(int idx)
 3{
 4    if (idx) {
 5        global_refcount_storage_[idx]++;
 6    }
 7    return idx;
 8}
 9
10// 释放引用
11static inline void put_reference(int idx)
12{
13    if (!destruct_guard_ok || !idx)
14        return;
15
16    int &refcount = global_refcount_storage_[idx];
17
18    if (--refcount > 0)
19        return;  // 还有引用，不释放
20
21    // 引用归零：释放字符串，回收索引
22    log_assert(refcount == 0);
23    free_reference(idx);
24}
25
26static inline void free_reference(int idx)
27{
28    // 从哈希表移除
29    global_id_index_.erase(global_id_storage_.at(idx));
30    // 释放C字符串
31    free(global_id_storage_.at(idx));
32    global_id_storage_.at(idx) = nullptr;
33    // 索引加入空闲列表
34    global_free_idx_list_.push_back(idx);
35}

析构保护机制

1// 防止静态析构顺序问题
2static bool destruct_guard_ok;  // POD，初始化为false
3
4static struct destruct_guard_t {
5    destruct_guard_t() { destruct_guard_ok = true; }
6    ~destruct_guard_t() { destruct_guard_ok = false; }
7} destruct_guard;

1// 问题场景：全局IdString在静态析构时
2static IdString g_clk("\\clk");  // 全局对象
3
4// main结束后析构顺序不确定：
5// 如果global_refcount_storage_先析构，g_clk析构时访问已释放内存
6// destruct_guard在global_refcount_storage_之后析构，保护机制生效

Sticky IDs优化

 1#ifdef YOSYS_USE_STICKY_IDS
 2static int last_created_idx_ptr_;
 3static int last_created_idx_[8];  // 环形缓冲区
 4#endif
 5
 6// 在get_reference中：
 7#ifdef YOSYS_USE_STICKY_IDS
 8// 避免 Create->Delete->Create 模式
 9// 场景：临时IdString创建后立即销毁，然后再次创建相同字符串
10// 问题：索引被回收又分配，导致缓存失效
11// 解决：保留最近创建的8个索引不回收
12
13if (last_created_idx_[last_created_idx_ptr_])
14    put_reference(last_created_idx_[last_created_idx_ptr_]);
15last_created_idx_[last_created_idx_ptr_] = idx;
16get_reference(last_created_idx_[last_created_idx_ptr_]);
17last_created_idx_ptr_ = (last_created_idx_ptr_ + 1) & 7;
18#endif

IdString 相比 std::string 的优势

性能比较

场景	IdString	std::string
对象大小	4 字节	32 字节
字符串存储	全局唯一存储，无重复	每个 string 独立管理或共享
构造（已存在）	O(1) 哈希查找	O(n) 拷贝
比较	O(1) 整数比较	O(n) 逐字符
哈希	O(1) 预计算	O(n) 实时计算
拷贝	O(1) 整数拷贝	O(n) 深拷贝或引用计数
析构	O(1)	取决于实现

缓存局部性

1// IdString：索引访问连续内存，缓存友好
2for (auto id : id_list) {
3    const char* str = global_id_storage_[id.index_];  // 顺序访问
4}
5
6// std::string：指针跳跃，缓存不友好
7for (const auto& s : string_list) {
8    // s.data() 指向堆上随机位置
9}

底层用 `char*` 而非 `std::string`的优势

char* + strdup 实现隐式去重

 1static dict<char*, int> global_id_index_;  // char* → 索引
 2static std::vector<char*> global_id_storage_;  // 存储
 3
 4// 插入时
 5char* dup = strdup(str);  // 新分配
 6if (global_id_index_.count(dup)) {
 7    free(dup);  // 已存在，释放重复
 8    return existing_index;
 9}
10// 新字符串保留，地址唯一

利用 strdup 的堆分配特性——相同内容的字符串若已存在，其指针地址必然相同，哈希表可直接用指针比较替代字符串比较，达到 O(1) 的查找和比较性能。

考量	`char*`	`std::string`
内存开销	8 字节指针	≥32 字节（SSO 除外）
哈希表键比较	指针直接比较（地址相同即相等）	逐字符比较或自定义哈希
去重机制	`strdup` 保证相同内容相同地址	需额外处理

应用场景

不适用场景

字符串基本唯一，池化引入额外开销却无去重收益
比较操作极少，池化的查找成本无法摊平
工具运行时间短，构造和销毁池的开销占比过高
多线程高并发，全局池成为锁竞争热点，反而降低性能

字符串池化解析：以Yosys IdString为例

问题背景

Yosys IdString的解决方案

全局存储架构

String Interning

引用计数生命周期管理

析构保护机制

Sticky IDs优化

IdString 相比 std::string 的优势

性能比较

缓存局部性

底层用 char* 而非 std::string的优势

应用场景

推荐场景

不适用场景

底层用 `char*` 而非 `std::string`的优势