]>
Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
cae15db7 DT |
2 | #include <string.h> |
3 | #include "util.h" | |
4 | #include "debug.h" | |
5 | ||
6 | #include "demangle-rust.h" | |
7 | ||
8 | /* | |
9 | * Mangled Rust symbols look like this: | |
10 | * | |
11 | * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a | |
12 | * | |
13 | * The original symbol is: | |
14 | * | |
15 | * <std::sys::fd::FileDesc as core::ops::Drop>::drop | |
16 | * | |
17 | * The last component of the path is a 64-bit hash in lowercase hex, prefixed | |
18 | * with "h". Rust does not have a global namespace between crates, an illusion | |
19 | * which Rust maintains by using the hash to distinguish things that would | |
20 | * otherwise have the same symbol. | |
21 | * | |
22 | * Any path component not starting with a XID_Start character is prefixed with | |
23 | * "_". | |
24 | * | |
25 | * The following escape sequences are used: | |
26 | * | |
27 | * "," => $C$ | |
28 | * "@" => $SP$ | |
29 | * "*" => $BP$ | |
30 | * "&" => $RF$ | |
31 | * "<" => $LT$ | |
32 | * ">" => $GT$ | |
33 | * "(" => $LP$ | |
34 | * ")" => $RP$ | |
35 | * " " => $u20$ | |
36 | * "'" => $u27$ | |
37 | * "[" => $u5b$ | |
38 | * "]" => $u5d$ | |
39 | * "~" => $u7e$ | |
40 | * | |
41 | * A double ".." means "::" and a single "." means "-". | |
42 | * | |
43 | * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ | |
44 | */ | |
45 | ||
/*
 * Layout of the trailing hash component of a mangled symbol: the literal
 * "::h" marker (hash_prefix, hash_prefix_len characters long) followed by
 * hash_len lowercase hex digits.
 */
46 | static const char *hash_prefix = "::h"; | |
47 | static const size_t hash_prefix_len = 3; | |
48 | static const size_t hash_len = 16; | |
49 | ||
/* Forward declarations of the file-local helpers defined further down. */
50 | static bool is_prefixed_hash(const char *start); | |
51 | static bool looks_like_rust(const char *sym, size_t len); | |
52 | static bool unescape(const char **in, char **out, const char *seq, char value); | |
53 | ||
54 | /* | |
55 | * INPUT: | |
56 | * sym: symbol that has been through BFD-demangling | |
57 | * | |
58 | * This function looks for the following indicators: | |
59 | * | |
60 | * 1. The hash must consist of "h" followed by 16 lowercase hex digits. | |
61 | * | |
62 | * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible | |
63 | * hex digits. This is true of 99.9998% of hashes so once in your life you | |
64 | * may see a false negative. The point is to notice path components that | |
65 | * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In | |
66 | * this case a false positive (non-Rust symbol has an important path | |
67 | * component removed because it looks like a Rust hash) is worse than a | |
68 | * false negative (the rare Rust symbol is not demangled) so this sets the | |
69 | * balance in favor of false negatives. | |
70 | * | |
71 | * 3. There must be no characters other than a-zA-Z0-9 and _.:$ | |
72 | * | |
73 | * 4. There must be no unrecognized $-sign sequences. | |
74 | * | |
75 | * 5. There must be no sequence of three or more dots in a row ("..."). | |
76 | */ | |
77 | bool | |
78 | rust_is_mangled(const char *sym) | |
79 | { | |
80 | size_t len, len_without_hash; | |
81 | ||
82 | if (!sym) | |
83 | return false; | |
84 | ||
85 | len = strlen(sym); | |
86 | if (len <= hash_prefix_len + hash_len) | |
87 | /* Not long enough to contain "::h" + hash + something else */ | |
88 | return false; | |
89 | ||
90 | len_without_hash = len - (hash_prefix_len + hash_len); | |
91 | if (!is_prefixed_hash(sym + len_without_hash)) | |
92 | return false; | |
93 | ||
94 | return looks_like_rust(sym, len_without_hash); | |
95 | } | |
96 | ||
97 | /* | |
98 | * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex | |
99 | * digits must comprise between 5 and 15 (inclusive) distinct digits. | |
100 | */ | |
101 | static bool is_prefixed_hash(const char *str) | |
102 | { | |
103 | const char *end; | |
104 | bool seen[16]; | |
105 | size_t i; | |
106 | int count; | |
107 | ||
108 | if (strncmp(str, hash_prefix, hash_prefix_len)) | |
109 | return false; | |
110 | str += hash_prefix_len; | |
111 | ||
112 | memset(seen, false, sizeof(seen)); | |
113 | for (end = str + hash_len; str < end; str++) | |
114 | if (*str >= '0' && *str <= '9') | |
115 | seen[*str - '0'] = true; | |
116 | else if (*str >= 'a' && *str <= 'f') | |
117 | seen[*str - 'a' + 10] = true; | |
118 | else | |
119 | return false; | |
120 | ||
121 | /* Count how many distinct digits seen */ | |
122 | count = 0; | |
123 | for (i = 0; i < 16; i++) | |
124 | if (seen[i]) | |
125 | count++; | |
126 | ||
127 | return count >= 5 && count <= 15; | |
128 | } | |
129 | ||
/*
 * Scan the first len characters of str and report whether they could be the
 * non-hash part of a mangled Rust symbol.
 *
 * Accepted input consists only of a-z, A-Z, 0-9, '_', ':', '.', and the
 * recognized "$...$" escape sequences. Three dots in a row, an unrecognized
 * '$' sequence, or any other character makes the function return false.
 */
static bool looks_like_rust(const char *str, size_t len)
{
	static const char *const escapes[] = {
		"$C$",
		"$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
		"$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
	};
	const size_t nr_escapes = sizeof(escapes) / sizeof(escapes[0]);
	const char *cur = str;
	const char *const limit = str + len;

	while (cur < limit) {
		const char c = *cur;

		if (c == '$') {
			size_t matched = 0;
			size_t i;

			/* Find the escape sequence that starts here, if any. */
			for (i = 0; i < nr_escapes; i++) {
				const size_t n = strlen(escapes[i]);

				if (strncmp(cur, escapes[i], n) == 0) {
					matched = n;
					break;
				}
			}
			if (matched == 0)
				return false;
			cur += matched;
			continue;
		}

		if (c == '.') {
			/* Do not allow three or more consecutive dots */
			if (strncmp(cur, "...", 3) == 0)
				return false;
			cur++;
			continue;
		}

		if ((c >= 'a' && c <= 'z') ||
		    (c >= 'A' && c <= 'Z') ||
		    (c >= '0' && c <= '9') ||
		    c == '_' || c == ':') {
			cur++;
			continue;
		}

		return false;
	}

	return true;
}
174 | ||
175 | /* | |
176 | * INPUT: | |
177 | * sym: symbol for which rust_is_mangled(sym) returns true | |
178 | * | |
179 | * The input is demangled in-place because the mangled name is always longer | |
180 | * than the demangled one. | |
181 | */ | |
182 | void | |
183 | rust_demangle_sym(char *sym) | |
184 | { | |
185 | const char *in; | |
186 | char *out; | |
187 | const char *end; | |
188 | ||
189 | if (!sym) | |
190 | return; | |
191 | ||
192 | in = sym; | |
193 | out = sym; | |
194 | end = sym + strlen(sym) - (hash_prefix_len + hash_len); | |
195 | ||
196 | while (in < end) | |
197 | switch (*in) { | |
198 | case '$': | |
199 | if (!(unescape(&in, &out, "$C$", ',') | |
200 | || unescape(&in, &out, "$SP$", '@') | |
201 | || unescape(&in, &out, "$BP$", '*') | |
202 | || unescape(&in, &out, "$RF$", '&') | |
203 | || unescape(&in, &out, "$LT$", '<') | |
204 | || unescape(&in, &out, "$GT$", '>') | |
205 | || unescape(&in, &out, "$LP$", '(') | |
206 | || unescape(&in, &out, "$RP$", ')') | |
207 | || unescape(&in, &out, "$u20$", ' ') | |
208 | || unescape(&in, &out, "$u27$", '\'') | |
209 | || unescape(&in, &out, "$u5b$", '[') | |
210 | || unescape(&in, &out, "$u5d$", ']') | |
211 | || unescape(&in, &out, "$u7e$", '~'))) { | |
212 | pr_err("demangle-rust: unexpected escape sequence"); | |
213 | goto done; | |
214 | } | |
215 | break; | |
216 | case '_': | |
217 | /* | |
218 | * If this is the start of a path component and the next | |
219 | * character is an escape sequence, ignore the | |
220 | * underscore. The mangler inserts an underscore to make | |
221 | * sure the path component begins with a XID_Start | |
222 | * character. | |
223 | */ | |
224 | if ((in == sym || in[-1] == ':') && in[1] == '$') | |
225 | in++; | |
226 | else | |
227 | *out++ = *in++; | |
228 | break; | |
229 | case '.': | |
230 | if (in[1] == '.') { | |
231 | /* ".." becomes "::" */ | |
232 | *out++ = ':'; | |
233 | *out++ = ':'; | |
234 | in += 2; | |
235 | } else { | |
236 | /* "." becomes "-" */ | |
237 | *out++ = '-'; | |
238 | in++; | |
239 | } | |
240 | break; | |
241 | case 'a' ... 'z': | |
242 | case 'A' ... 'Z': | |
243 | case '0' ... '9': | |
244 | case ':': | |
245 | *out++ = *in++; | |
246 | break; | |
247 | default: | |
248 | pr_err("demangle-rust: unexpected character '%c' in symbol\n", | |
249 | *in); | |
250 | goto done; | |
251 | } | |
252 | ||
253 | done: | |
254 | *out = '\0'; | |
255 | } | |
256 | ||
/*
 * Try to consume the escape sequence seq at the read cursor *in.
 *
 * On a match, value is stored at the write cursor *out, *in is advanced past
 * the sequence, *out is advanced by one character, and true is returned.
 * Otherwise both cursors are left unchanged and false is returned.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
	const size_t seq_len = strlen(seq);

	if (strncmp(*in, seq, seq_len) != 0)
		return false;

	/* Emit the replacement character, then step over the whole escape. */
	*(*out)++ = value;
	*in += seq_len;

	return true;
}