1 //! Sanitise names for use in file systems and the likes.
3 //! The output string is guaranteed to be shorter than or equal to the input string in length,
4 //! except for file names that are reserved on Windows (see [`Options::windows_safe`]), in which
5 //! case an underscore is appended to the base name (e.g. NUL → NUL_, aux.h → aux_.h).
7 //! The key parts of the API:
9 #![cfg_attr(feature = "alloc", doc = "\
10 - <code>[sanitise][](input: &str) -> String</code>: the simplest thing to call;
12 - <code>[sanitise_with_options][](input: &str, options: &Options<_>) -> String</code>:
13 when you want to tweak the nature of the sanitisation; and")]
14 #![cfg_attr(not(feature = "alloc"), doc = "\
15 - <s><code>[sanitise][](input: &str) -> String</code>: the simplest thing to call</s>
16 *(disabled in this build due to compiling without the `alloc` feature)*;
18 - <s><code>[sanitise_with_options][](input: &str, options: &Options<_>) -> String</code>:
19 when you want to tweak the nature of the sanitisation</s> *(disabled in this build due to
20 compiling without the `alloc` feature)*; and")]
22 //! - [`Options`], with detailed descriptions of each option.
24 //! And for advanced users that want to control allocations or other similar things:
26 //! - <code>[sanitise_to][](input: &str, options: &Options<_>, out: &mut <em>String</em>)</code>,
28 #![cfg_attr(feature = "alloc", doc = " a `String`")]
29 #![cfg_attr(not(feature = "alloc"), doc = " a string")]
30 #![cfg_attr(feature = "tinyvec_string", doc = " or [`tinyvec_string::ArrayString`]")]
31 #![cfg_attr(all(docsrs, feature = "tinyvec_string"), doc = " (when enabled)")]
33 //! for which the following methods may help:
35 //! - <code>[max_alloc_size][](options: &Options<_>)</code> or
36 //! <code>[max_alloc_size_const][](options: &Options<Option<char>>)</code>,
//!   to suggest a size for a scratch buffer
38 #![cfg_attr(feature = "tinyvec_string", doc = " or `ArrayString`")]
41 //! - <code>[sufficient_alloc_size][](input: &str, options: &Options<_>) -> usize</code>, to
42 //! suggest a size that will definitely be sufficient for one given input (mainly useful when you
43 //! are crafting a path with stuff before and after it).
45 //! … but that’s dangerous territory, deep rabbit holes; ask if you actually *need* them—don’t be
46 //! like me. (When I am laid in earth, may my wrongs create no trouble in thy breast. Remember me,
47 //! but ah! forget my fate.)
49 //! ### Conditional compilation/Cargo features
51 //! This crate has several features:
53 //! - **std**, enabled by default. Implies *alloc*. Disable it to get `#![no_std]` operation.
55 //! - **alloc**, enabled by default via *std*. Provides the ability to sanitise to a `String` in
56 //! `sanitise_to`, and the `sanitise` and `sanitise_with_options` functions.
58 //! - **tinyvec_string**, disabled by default. Provides the ability to sanitise to
59 //! `tinyvec_string::ArrayString`, which works without *alloc*.
61 //! - **const-fn-trait-bound**, disabled by default, requires rustc nightly at the time of writing.
62 //! Makes [`max_alloc_size`] const.
64 //! These docs were built with these features enabled:
65 #![cfg_attr(feature = "std", doc = " <span class='stab portability'><code>std</code></span>")]
66 #![cfg_attr(feature = "alloc", doc = " <span class='stab portability'><code>alloc</code></span>")]
67 #![cfg_attr(feature = "tinyvec_string", doc = " <span class='stab portability'><code>tinyvec_string</code></span>")]
68 #![cfg_attr(feature = "const-fn-trait-bound", doc = " <span class='stab portability'><code>const-fn-trait-bound</code></span>")]
72 not(feature
= "alloc"),
73 not(feature
= "tinyvec_string"),
74 not(feature
= "const-fn-trait-bound"),
76 doc
= " *(none of them)*")]
78 //! … and these features disabled:
79 #![cfg_attr(not(feature = "std"), doc = " <span class='stab portability'><code>std</code></span>")]
80 #![cfg_attr(not(feature = "alloc"), doc = " <span class='stab portability'><code>alloc</code></span>")]
81 #![cfg_attr(not(feature = "tinyvec_string"), doc = " <span class='stab portability'><code>tinyvec_string</code></span>")]
82 #![cfg_attr(not(feature = "const-fn-trait-bound"), doc = " <span class='stab portability'><code>const-fn-trait-bound</code></span>")]
87 feature
= "tinyvec_string",
88 feature
= "const-fn-trait-bound",
90 doc
= " *(none of them)*")]
94 #![cfg_attr(not(feature = "std"), no_std)]
95 #![cfg_attr(feature = "const-fn-trait-bound", feature(const_fn_trait_bound))]
96 #![cfg_attr(docsrs, feature(doc_cfg))]
97 #![cfg_attr(not(feature = "alloc"), allow(rustdoc::broken_intra_doc_links))] // I’m lazy.
99 #[cfg(feature = "alloc")]
101 #[cfg(feature = "alloc")]
102 use alloc
::string
::String
;
104 use core
::ops
::{Deref
, Index
, Range
, RangeFrom
, RangeBounds
};
106 /// Sanitisation options. Defaults marked on each field.
108 /// Take a look around, but I think everything’s pretty sane by default; the ones I think you’re
109 /// most likely to want to change are `url_safe` and `windows_safe`, though `replace_with`,
110 /// `collapse_replacements` and `six_measures_of_barley` can be interesting too for yielding
111 /// prettier results.
113 /// If you set `length_limit` to `usize::MAX`, all the bool fields to `false`, and
114 /// `six_measures_of_barley` to an empty string, `sanitise` will not alter the input string in any
115 /// way. But that would be a rather expensive alternative to `.clone()`. In practice, I doubt you
116 /// ever want to disable `most_fs_safe`, which is a good baseline.
118 pub struct Options
<R
: Replace
> {
119 /// Limit the complete file name to this many UTF-8 code units. The default is **255**, which
120 /// is suitable for all practical platforms.
122 /// (Some file systems limit lengths in UTF-8 code units and some in UTF-16 code units, but
123 /// UTF-16 never takes more code units than UTF-8 to encode a given Unicode string, so we can
126 /// Reasons you might want to reduce it:
128 /// 1. You haven’t appended the extension yet, and so want to subtract the extension’s length.
129 /// (In that case I suggest writing `Options::DEFAULT.length_limit` instead of hard coding
130 /// 255—that’ll work in const context.)
132 /// 2. You want smoother Windows support, for on Windows some things start falling over if the
133 /// total path length is greater than 260 characters; so measuring or estimating the path
134 /// length could potentially be useful—but unless you know, probably don’t worry too much,
135 /// someone’ll probably drop it deep in a node_modules tree at some point and then you’ll be
    ///    in trouble anyway. 😀 <!-- Okay, okay, so node_modules trees are typically flattened
    ///    almost entirely these days; but let me have my joke, please? -->
139 /// One other mildly significant note here: if you care about Apple’s pre-2017 HFS+ file
140 /// system, you should perform Unicode normalisation to NFD (most likely via the
141 /// `unicode-normalization` crate) before performing sanitisation, because the decomposed form
142 /// may be longer; if you don’t, then the path will be normalised to NFD by the file system
143 /// when you try to write it, which could take it over 255 and make it fail. I don’t think
144 /// there are any popular file systems that normalise any more, though APFS kinda prefers NFC,
145 /// so you might want to normalise to NFC. I do not know if normalising to NFC will ever
146 /// lengthen a UTF-8 string, but the spec allows it to (UAX #15, goal 3.2).
148 /// The minimum permitted value is 10, for reasons of implementation convenience and because I
149 /// don’t think there’s any legitimate use case for a smaller value. If you provide a value
150 /// less than ten, you’ll get an empty string back every time.
152 /// Truncations are performed at `char` granularity (Unicode scalar value), which means that
153 /// extended grapheme clusters could be broken. This could change in the future (it’ll be an
154 /// optional dependency on `unicode-segmentation`), but for now it was just too much thought.
155 /// If I ever implement this, I’ll probably ditch the minimum value of 10 too.
156 // (Most significantly, it doesn’t play terribly nicely with extension cleverness: six would no
157 // longer be sufficient to guarantee a base name, so more involved calculations and overflow
158 // tracking would need to be done. It’s perfectly achievable, but painful.)
159 pub length_limit
: usize,
161 /// When allocating the string (since it allocates as small a string as possible), reserve at
162 /// least this many extra bytes. This is good for efficiency when you append the extension
163 /// after sanitisation (in which case, also disable `extension_cleverness`). Default **0**.
164 pub reserve_extra
: usize,
166 /// Make other options try to be clever about a file extension in the input. Default `true`.
168 /// Specifically, if a file extension is detected (done by looking for the last full stop in
169 /// the name, and splitting at that point into base name and extension):
171 /// 1. `length_limit` will try to keep the extension intact, truncating the base name rather
172 /// than the extension. “Try”, because if the extension is longer than six code units less
173 /// than the length limit, it will be deemed unsalvageable. (Why six? The base name must
174 /// retain at least one character, so for convenience that’s four UTF-8 code units, plus one
175 /// more for the dot, and if `windows_safe` is on, the longest reserved name causes a five
176 /// code unit base name like `LPT1_`, and ridiculously long extensions are a corner case
177 /// anyway so I decided to just call it a day at six. If I subsequently implement
178 /// grapheme-cluster-aware truncation, this six will increase if the first grapheme cluster
179 /// in the base name is more than five code units long.) An unsalvageable extension is the
180 /// only case where sanitisation may take two steps to quiesce, rather than one: if the
181 /// extension is entirely truncated and the base name contains a dot which in a subsequent
182 /// run will be interpreted as the extension separator, trimming will happen around it on
183 /// that subsequent run but not the first.
185 /// 2. `windows_safe` will detect reserved names with extensions.
187 /// 3. `trim_spaces_and_full_stops` and `trim_more_punctuation` will trim those characters from
188 /// the end of the base name and the start of the extension, in addition to the start and
189 /// end of the full name. (Expressed otherwise, the base name and extension will be trimmed
192 /// If you’re appending the extension after sanitisation, you should turn this to false.
193 pub extension_cleverness
: bool
,
195 /// Remove characters that are not safe on just about any file system. Default `true`, and if
196 /// you actually want to disable it you’re probably using the wrong crate.
198 /// This plus `length_limit` is enough to satisfy most platforms other than Windows, though
199 /// cleaning somewhat more is probably a good idea.
201 /// Characters removed:
204 /// - ␀ (null, character zero)
206 /// Also disallows names comprising exclusively dots (`"."`, `".."`, `"..."`, *&c.*), NOT using
207 /// `replace_with` on them but yielding an empty string.
209 /// This is a tiny subset of `windows_safe`.
210 pub most_fs_safe
: bool
,
212 /// Ensure the file name is safe on Windows. Default `true`.
214 /// [These are the rules applied:](https://docs.microsoft.com/en-au/windows/win32/fileio/naming-a-file#naming-conventions)
216 /// - These characters are removed (and `replace_with` employed):
218 /// - `<` (less than)
219 /// - `>` (greater than)
221 /// - `"` (double quote)
222 /// - `/` (forward slash)
223 /// - `\` (backslash)
224 /// - `|` (vertical bar/pipe)
225 /// - `?` (question mark)
227 /// - The C0 control characters, 0–31 and 127 (U+0000–U+001F, U+007F); note that U+007F isn’t
228 /// actually part of C0, but Microsoft included it in this list so I do too.
230 /// - Names must not end with a space or a dot (so these are removed recursively—for reasons of
231 /// technical convenience, `replace_with` is NOT employed).
233 /// - These names are reserved (and so a trailing underscore is added to the base name),
234 /// including with an extension if `extension_cleverness` is enabled:
236 /// - CON, PRN, AUX, NUL,
237 /// - COM1, COM2, COM3, COM4, COM5, COM6, COM7, COM8, COM9,
238 /// - LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, and LPT9
240 /// Most of these restrictions are actually not quite universal in Windows, but getting around
241 /// them requires switching into POSIX mode or using long UNC paths (e.g. `\\.\C:\CON`,
242 /// `\\?\D:\aux.h`), and your life will certainly be miserable if you try using them; so
243 /// they’re all considered not Windows-safe.
244 pub windows_safe
: bool
,
246 /// Remove characters that may be problematic in the usual places in URLs. Default `false`.
248 /// If you want something URL-safe, consider slugifying instead (see below).
250 /// This removes any character that is not what’s called a [*URL code point*], also removes the
251 /// characters `&`, `/` and `?`, and forbids the names `.` and `..` which have a special
252 /// meaning in paths. The result is either an empty string, or suitable for use as a path
253 /// component, query string value or fragment, without generally *needing* percent-encoding:
254 /// such a URL will be correctly parsed by a WHATWG URL Standard parser, though nominally
255 /// invalid¹, but older or poorer-quality URL parsers may need percent-encoding to cope with
256 /// the non-ASCII that is retained.
258 /// Some notable characters that are removed: `/`, `\`, `%`, `?`, `#`, `&`, `"`, and space.
260 /// Almost all non-ASCII is retained.
262 /// Notes on using these URLs in some common formats:
264 /// - In HTML, no escaping is needed in `<a href="http://www.example/fïle_ñamê">`, because `&`
265 /// and `"` are the only two characters needing escaping in a double-quoted attribute value,
266 /// and both are removed by `url_safe`.
    /// - In plain text formats following the longstanding convention of angle bracket delimitation
269 /// (`<http://www.example/lïke_τhis>`), no escaping should be required as `>` is removed by
270 /// `url_safe`. This includes Markdown. However, some such parsers could be stricter about
271 /// what’s allowed inside the angle brackets, so you may need or want to use a URL Standard
272 /// serialiser to do percent-encoding of the non-ASCII.
274 /// - In Markdown `[text](href)` links, you’ll want to manually percent-encode `(` to `%28` and
275 /// `)` to `%29`. This is yet another bad choice in Markdown’s technical foundation:
276 /// parentheses aren’t percent-encoded, never have been; so using a URL Standard serialiser
277 /// won’t help you, you’ll instead need to manually encode them, or unpaired parentheses will
278 /// break the link and possibly eat your laundry².
280 /// Given that this produces nominally-invalid URLs, you may be wondering why to bother at all;
281 /// it really comes down to characters like `?`, `/` and `#`: you *can* include them in paths
282 /// by percent-encoding, but it’s too likely that *somewhere* along the way, *something* will
283 /// mangle your path, not encoding it properly, and everything will break—basically the entire
284 /// *system* has to process the URL correctly; ever tried a path component containing `%2F`?
285 /// But if you’ve removed the genuinely problematic characters, then in theory things can no
286 /// longer go wrong once you’re past the parser. And being able to skip percent-encoding your
287 /// URLs when you know you’ll be using a proper URL parser is nice.
289 /// I deliberately haven’t provided an option for removing characters that would make a URL
290 /// nominally invalid (which is “non-ASCII”), because I think that goes too far: in such a
291 /// case, I don’t think you should *strip* such characters, but rather slugify the whole thing
292 /// (which can do things like `Voilà!` → `voila`).
294 /// `replace_with` is used for the character removals, but NOT for the forbidding of the names
295 /// `"."` and `".."`, for which it will instead yield an empty string.
299 /// ¹ “Invalid” is just a label in WHATWG specs; it doesn’t change anything, and parsing is
300 /// still well-defined, it’s generally just a hint that either you may have made a mistake,
301 /// or that older tools might not handle this case the same way.
303 /// ² When Americans say “eat your laundry” they mean the *clothes*. An Australian seeking to
304 /// express *that* concept would say “eat your washing” (and probably be looked at strangely
305 /// because it’s not an expression in common use). The laundry is the room in which clothes
306 /// are washed; so when I say injection attacks might eat your laundry——
308 /// [*URL code point*]: https://url.spec.whatwg.org/#url-code-points
311 /// Replace all sequences of whitespace with one space. Default `true`.
313 /// This uses the Unicode `White_Space` property to decide ([`char::is_whitespace`]).
315 /// This is done in two phases:
317 /// 1. Before safety character replacements, each whitespace character is normalised to a
318 /// U+0020 SPACE; `replace_with` is not invoked.
320 /// 2. After all character replacements, adjacent spaces (including any produced by
321 /// `replace_with`, independent of `collapse_replacements`) are collapsed to just one.
322 pub normalise_whitespace
: bool
,
324 /// Remove spaces and full stops (`.`) from the start and end of the name. Default `true`.
326 /// `normalise_whitespace` is performed before this; with it on, this will trim all whitespace,
327 /// with it off it’ll only trim U+0020 SPACE.
329 /// All things that invoke `replace_with` are performed before this; thus, if you replace a
330 /// character with a space or full stop, that could get trimmed. `replace_with` is not invoked
331 /// on any characters removed by this.
333 /// If `extension_cleverness` is enabled (which it is by default), on names with an extension
334 /// this trims from the start and end of the base name and extension independently, rather than
335 /// just the start and end of the full string. That is, `" foo . bar . baz "` will become
336 /// `"foo . bar.baz"` with `extension_cleverness`, and `"foo . bar . baz"` without.
338 /// This is independent of `windows_safe`, which also trims trailing spaces and dots from the
340 // BTW: U+002E is named FULL STOP, oh uncouth Americans. 😀
341 pub trim_spaces_and_full_stops
: bool
,
343 /// Remove a few more punctuationy characters from the start and end of the name.
346 /// This is a more aggressive supplement to `trim_spaces_and_full_stops`, trimming from the
347 /// same places in the same way. These characters are removed:
349 /// - `_` (underscore; especially significant because `replace_with` defaults to an underscore)
350 /// - `-` (hyphen/dash/minus)
352 /// - `;` (semicolon)
353 pub trim_more_punctuation
: bool
,
355 /// Remove control characters. Default `true`.
357 /// This removes all characters with the general category *Control*: C0 controls U+0000–U+001F,
358 /// control character U+007F, and C1 controls U+0080–U+009F.
360 /// `replace_with` is invoked on these removals.
361 pub remove_control_characters
: bool
,
363 /// Remove BiDi control characters that are relevant to reordering attacks. Default `true`.
365 /// <https://trojansource.codes/trojan-source.pdf> is a paper with info about the attack.
367 /// This removes U+202A–U+202E and U+2066–U+2069. It does NOT remove the remaining three
368 /// Bidi_Control characters U+061C, U+200E and U+200F (ALM, LRM, RLM),
369 /// which are not implicated in the attack and are conceivably useful in file names.
371 /// `replace_with` is invoked on these removals.
372 pub remove_reordering_characters
: bool
,
374 /// Where characters are removed (except as marked), replace them with this.
375 /// Default `Some('_')`.
377 /// If you provide a character that would normally be removed, it will not be removed: that
378 /// processing is done once only.
380 /// If you provide a character that would be trimmed, it may or may not be trimmed: end matches
381 /// will be trimmed, start matches only will be if ridiculously long names and/or extensions
382 /// force unusual truncation, exposing the start of the string (so that it gets trimmed to
386 /// Where multiple adjacent characters are to be replaced, only replace the first, and remove
387 /// any subsequent ones. Default `false`.
389 /// See also `normalise_whitespace`, which can collapse replacements if you replace with
391 pub collapse_replacements
: bool
,
393 /// If sanitisation would leave the path empty, return this string instead. Default `"_"`.
395 /// This exists because I found myself writing `if name.is_empty() { name.push('_') }` after
396 /// every time I called `sanitise`. I think most of the time you don’t want to be left with an
397 /// empty string, and inserting *something* is tolerable, so this is on by default as something
398 /// fairly neutral that aligns with the `replace_with` default as well. You can effectively
399 /// disable this by setting this to an empty string.
401 /// `length_limit` is not taken into account on this. If you put something ridiculously long in
402 /// it, you brought it on yourself and I wash my hands of it, as Pontius Pilate of old.
404 /// (Read *Ruth 3:15–17* from the Bible to understand the name of this option.)
405 pub six_measures_of_barley
: &'
static str,
408 // Implemented on just one type for inference reasons. One might wonder why I use an associated
409 // constant at all. This would not be an unreasonable thing to wonder.
410 impl Options
<Option
<char>> {
411 /// The default options. This is more useful than `Options::default()` (which just returns
412 /// this) because it’s const, so you can access `Options::DEFAULT.length_limit` in const
414 pub const DEFAULT
: Self = Options
{
417 extension_cleverness
: true,
421 normalise_whitespace
: true,
422 trim_spaces_and_full_stops
: true,
423 trim_more_punctuation
: true,
424 remove_control_characters
: true,
425 remove_reordering_characters
: true,
426 replace_with
: Some('_'
),
427 collapse_replacements
: false,
428 six_measures_of_barley
: "_",
432 impl Default
for Options
<Option
<char>> {
433 fn default() -> Self {
438 impl<R
: Replace
> Options
<R
> {
439 /// A workaround for an otherwise-messy type situation with filling in defaults.
441 /// This solves the problem that you can’t write this:
444 /// Options { replace_with: |c| /* … */, ..Options::DEFAULT }
447 /// … because struct update syntax doesn’t currently allow you to change types, and
448 /// `Options::DEFAULT` is an `Options<Option<char>>`, but with a closure for `replace_with`
449 /// you’re needing to change it to `Options<[closure@…]>`. So instead, write like one of these:
452 /// Options::DEFAULT.with_replace_with(|c| /* … */)
453 /// Options { /* … */, ..Options::DEFAULT }.with_replace_with(|c| /* … */)
456 /// If you’re using nightly rustc, you can try the [incomplete type-changing-struct-update
457 /// feature](https://github.com/rust-lang/rust/issues/86555) instead, which lets the first code
458 /// work (so long as this unstable and incomplete feature is working):
461 /// #![feature(type_changing_struct_update)]
462 /// use sanitise_file_name::Options;
465 /// Options { replace_with: |c| /* … */, ..Options::DEFAULT }
468 pub fn with_replace_with
<R2
: Replace
>(self, new_replace_with
: R2
) -> Options
<R2
> {
470 length_limit
: self.length_limit
,
471 reserve_extra
: self.reserve_extra
,
472 extension_cleverness
: self.extension_cleverness
,
473 most_fs_safe
: self.most_fs_safe
,
474 windows_safe
: self.windows_safe
,
475 url_safe
: self.url_safe
,
476 normalise_whitespace
: self.normalise_whitespace
,
477 trim_spaces_and_full_stops
: self.trim_spaces_and_full_stops
,
478 trim_more_punctuation
: self.trim_more_punctuation
,
479 remove_control_characters
: self.remove_control_characters
,
480 remove_reordering_characters
: self.remove_reordering_characters
,
481 replace_with
: new_replace_with
,
482 collapse_replacements
: self.collapse_replacements
,
483 six_measures_of_barley
: self.six_measures_of_barley
,
488 /// See [`Options::replace_with`].
490 // “Why no *string* replacement?” I hear you ask.
491 // Because then I couldn’t guarantee one allocation.
492 fn replace(&self, char_being_removed
: char) -> Option
<char>;
495 /// `None`: just remove the character, don’t replace it.
496 /// `Some`: replace the character with this character.
497 impl Replace
for Option
<char> {
498 fn replace(&self, _
: char) -> Option
<char> {
503 /// Call this function with the character that is being removed,
504 /// and if it returns a character, replace it with that.
505 impl<F
: Fn(char) -> Option
<char>> Replace
for F
{
506 fn replace(&self, c
: char) -> Option
<char> {
/// Reports whether `c` may be kept under the `most_fs_safe` option.
///
/// Only two characters are rejected: the forward slash and the null character.
fn is_most_fs_safe_char(c: char) -> bool {
    !matches!(c, '/' | '\0')
}
/// Reports whether `c` may be kept under the `url_safe` option.
///
/// A character is safe when it belongs to the *URL code point* set of the URL Standard, with
/// `&`, `/` and `?` deliberately taken out of that set.
fn is_url_safe_char(c: char) -> bool {
    // Definitions from the URL and Infra Standards:
    //
    // > The *URL code points* are ASCII alphanumeric, U+0021 (!), U+0024 ($), U+0026 (&),
    // > U+0027 ('), U+0028 LEFT PARENTHESIS, U+0029 RIGHT PARENTHESIS, U+002A (*), U+002B (+),
    // > U+002C (,), U+002D (-), U+002E (.), U+002F (/), U+003A (:), U+003B (;), U+003D (=),
    // > U+003F (?), U+0040 (@), U+005F (_), U+007E (~), and code points in the range U+00A0 to
    // > U+10FFFD, inclusive, excluding surrogates and noncharacters.
    //
    // > A *noncharacter* is a code point that is in the range U+FDD0 to U+FDEF, inclusive, or
    // > U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, …, U+10FFFE, or U+10FFFF.
    //
    // Surrogates cannot occur at all: the `char` type already rules them out.

    // The noncharacters U+??FFFE and U+??FFFF all share the same low bits, so one mask test
    // rejects every one of them up front.
    if (c as u32) & 0xfffe == 0xfffe {
        return false;
    }

    if c.is_ascii_alphanumeric() {
        return true;
    }

    match c {
        // ASCII punctuation from the URL code point list; '&', '/' and '?' are left out on
        // purpose.
        '!' | '$' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | ':' | ';' | '=' | '@'
        | '_' | '~' => true,
        // Non-ASCII, skipping the noncharacter block U+FDD0–U+FDEF.
        '\u{a0}'..='\u{fdcf}' | '\u{fdf0}'..='\u{10fffd}' => true,
        _ => false,
    }
}
/// Reports whether `c` may be kept under the `windows_safe` option.
///
/// Rejected are the punctuation characters `<`, `>`, `:`, `"`, `/`, `\`, `|`, `?` and `*`,
/// the control characters U+0000–U+001F, and U+007F.
fn is_windows_safe_char(c: char) -> bool {
    match c {
        '<' | '>' | ':' | '"' | '/' | '\\' | '|' | '?' | '*' => false,
        '\u{0}'..='\u{1f}' | '\u{7f}' => false,
        _ => true,
    }
}
/// Reports whether `c` is U+0020 SPACE or U+002E FULL STOP.
fn is_space_or_full_stop(c: char) -> bool {
    c == ' ' || c == '.'
}
/// Reports whether `c` is one of the extra punctuation characters: underscore, hyphen, comma
/// or semicolon.
fn is_more_punctuation_character(c: char) -> bool {
    ['_', '-', ',', ';'].contains(&c)
}
/// Reports whether `c` lies in U+202A–U+202E or U+2066–U+2069.
fn is_reordering_character(c: char) -> bool {
    let in_first_range = ('\u{202A}'..='\u{202E}').contains(&c);
    let in_second_range = ('\u{2066}'..='\u{2069}').contains(&c);
    in_first_range || in_second_range
}
/// Reports whether `name` is exactly one of the device names this crate treats as reserved.
///
/// The comparison ignores ASCII case. Matched names are CON, PRN, AUX and NUL, plus COM and
/// LPT followed by a single digit from 1 to 9.
fn is_reserved_windows_file_name(name: &str) -> bool {
    const PLAIN_DEVICES: [&[u8]; 4] = [b"con", b"prn", b"aux", b"nul"];
    const NUMBERED_DEVICES: [&[u8]; 2] = [b"com", b"lpt"];

    let bytes = name.as_bytes();
    match bytes.len() {
        3 => PLAIN_DEVICES
            .iter()
            .any(|device| bytes.eq_ignore_ascii_case(device)),
        4 => {
            // Three-letter stem followed by one unit digit; zero is not accepted.
            let (stem, unit) = bytes.split_at(3);
            matches!(unit[0], b'1'..=b'9')
                && NUMBERED_DEVICES
                    .iter()
                    .any(|device| stem.eq_ignore_ascii_case(device))
        }
        _ => false,
    }
}
572 /// Split a name on its final '.', returning (base name, extension) if there is one.
573 /// Both could be empty.
574 fn split_extension(input: &str) -> Option<(&str, &str)> {
580 .find(|(_, c)| **c == b'.')
581 .map(|(dot_index, _)| (&input[..dot_index], &input[dot_index + 1..]))
584 /// Sanitise a file name with the default options.
585 /// See [`Options`] for a description of what all the options do.
587 /// The return value should be suitable as a file name, and will not be empty (if it *would* be
588 /// empty, it’ll be `_` instead, per [`Options::six_measures_of_barley`]’s default).
589 #[cfg(feature = "alloc
")]
590 #[cfg_attr(docsrs, doc(cfg(feature = "alloc
")))]
591 pub fn sanitise(s: &str) -> String {
592 sanitise_with_options(s, &Options::DEFAULT)
595 /// Calculate a sufficient allocation size for the string used. This number will never exceed
596 /// `input.len() + 1 + options.reserve_extra`, and will be less on ridiculously long inputs.
598 /// Only intended for use by crazy allocation-counters like me.
599 pub fn sufficient_alloc_size<R: Replace>(input: &str, options: &Options<R>) -> usize {
600 if options.length_limit < 10 {
603 if options.extension_cleverness {
604 if let Some((base_name, extension)) = split_extension(input) {
605 let extension_length_limit = options.length_limit - 6;
606 let might_add_underscore = |n| {
607 if (n == 3 || n == 4) && options.windows_safe {
614 might_add_underscore(base_name.len()).min(options.length_limit) +
616 extension.len().min(extension_length_limit)
617 // No reserve_extra on this side because this is the size needed *while working*,
618 // but reserve_extra is only needed when we’re done.
620 might_add_underscore(input.len())
621 .min(options.length_limit)
622 .max(options.six_measures_of_barley.len()) +
623 options.reserve_extra
627 (input.len().min(options.length_limit) + if options.windows_safe { '_'.len_utf8() } else { 0 })
628 .max(options.six_measures_of_barley.len()) + options.reserve_extra
// A const-compatible stand-in for <usize as Ord>::max, which alas isn’t const.
/// Returns the larger of `a` and `b`.
const fn max(a: usize, b: usize) -> usize {
    if b >= a {
        b
    } else {
        a
    }
}
634 macro_rules! max_alloc_size_body {
635 ($options:ident) => {{
636 if $options.length_limit < 10 {
639 let baseline = max($options.length_limit, $options.six_measures_of_barley.len())
640 + $options.reserve_extra;
641 if $options.extension_cleverness {
642 let extension_length_limit = $options.length_limit - 6;
643 max($options.length_limit + '.'.len_utf8() + extension_length_limit, baseline)
650 #[cfg(not(feature = "const-fn-trait-bound
"))]
651 /// Calculate the maximum allocation size required for a given set of options, to correctly handle
654 /// This is intended for the scratch buffer approach, where you keep one string around and keep on
655 /// sanitising a whole bunch of inputs into it in turn, or for array-allocated strings like with
656 /// `tinyvec_string`.
/// This is unfortunately not currently a const fn. If you need a const fn (e.g. to craft a
/// precisely-sized `ArrayString`), you may:
661 /// 1. Enable the `const-fn-trait-bound` feature on this crate (requires nightly rustc), which will
662 /// change this function to be const, or
664 /// 2. Use [`max_alloc_size_const`] instead, which requires `R = Option<char>`.
665 /// (There’s also a `tinyvec_string` usage demonstration there.)
666 pub fn max_alloc_size<R: Replace>(options: &Options<R>) -> usize {
667 max_alloc_size_body!(options)
670 #[cfg(feature = "const-fn-trait-bound
")]
671 /// Calculate the maximum allocation size required for a given set of options, to correctly handle
674 /// This is intended for the scratch buffer approach, where you keep one string around and keep on
675 /// sanitising a whole bunch of inputs into it in turn, or for array-allocated strings like with
676 /// `tinyvec_string`.
678 /// This is a const fn because this crate was compiled with the `const-fn-trait-bound` feature
679 /// enabled (which requires nightly rustc at the time of writing).
681 /// See also [`max_alloc_size_const`] for an example of using this with `tinyvec_string`.
682 pub const fn max_alloc_size<R: Replace>(options: &Options<R>) -> usize {
683 max_alloc_size_body!(options)
686 /// A `const` variant of [`max_alloc_size`].
688 /// Sample usage, combined with `tinyvec_string` (with its `rustc_1_55` feature enabled):
691 /// use tinyvec_string::ArrayString;
693 /// ArrayString::<[u8; max_alloc_size_const(&Options::DEFAULT)]>::new();
694 /// sanitise_to("input name
", &Options::DEFAULT, &mut string);
697 /// Once `const-fn-trait-bound` is stabilised, this method will be deprecated.
698 #[cfg_attr(feature = "const-fn-trait-bound
", doc = "\n \
699 Since you compiled this
crate with the `
const-fn-trait-bound` feature
, you don’t need this
700 method
. Be cheerful and
use `max_alloc_size` instead
!")]
701 pub const fn max_alloc_size_const(options: &Options<Option<char>>) -> usize {
702 max_alloc_size_body!(options)
705 /// Sanitise a file name. See [`Options`] for a description of what all the options do.
707 /// The return value should be suitable as a file name for the specified options,
708 /// unless it’s empty which can only happen if the option `six_measures_of_barley` is empty (or if
709 /// the `length_limit` option is illegally small, actually).
710 #[cfg(feature = "alloc
")]
711 #[cfg_attr(docsrs, doc(cfg(feature = "alloc
")))]
712 pub fn sanitise_with_options<R: Replace>(input: &str, options: &Options<R>) -> String {
713 let mut out = String::with_capacity(sufficient_alloc_size(input, options));
716 let initial_capacity = out.capacity();
718 sanitise_to(input, options, &mut out);
721 if initial_capacity != out.capacity() {
722 // I’m serious about this making exactly one allocation. No reallocating allowed.
723 panic!("Capacity changed from {initial_capacity} to {}
(on {:?} → {:?}
)",
724 out.capacity(), input, out);
730 /// A target for sanitisation: essentially the subset of `String` functionality used.
732 /// It might have been nice to use something like `Read + Write + Seek` instead, but the need to
733 /// delete things after writing means that you need still more, and in the end it’s much easier to
734 /// treat it as a string.
736 /// I’ve provided implementations for `String` (if the *alloc* feature is enabled, which it is by
737 /// default) and `tinyvec_string::ArrayString` (if the *tinyvec_string* feature is enabled),
738 /// but there’s nothing preventing you from implementing it on other similar string types.
740 Index<Range<usize>, Output = str> +
741 Index<RangeFrom<usize>, Output = str> +
742 Deref<Target = str> +
745 fn push(&mut self, ch: char);
746 fn push_str(&mut self, string: &str);
747 fn pop(&mut self) -> Option<char>;
748 fn truncate(&mut self, new_len: usize);
749 fn replace_range<R>(&mut self, range: R, replace_with: &str) where R: RangeBounds<usize>;
/// `Stringy` for the standard growable `String`; only available with the `alloc` feature.
///
/// Every method forwards to the inherent `String` method of the same name. Inherent methods take
/// precedence over trait methods in method resolution, so these calls do not recurse.
#[cfg(feature = "alloc")]
#[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
impl Stringy for String {
    #[inline]
    fn push(&mut self, ch: char) {
        self.push(ch)
    }

    #[inline]
    fn push_str(&mut self, string: &str) {
        self.push_str(string)
    }

    #[inline]
    fn pop(&mut self) -> Option<char> {
        self.pop()
    }

    #[inline]
    fn truncate(&mut self, new_len: usize) {
        self.truncate(new_len)
    }

    #[inline]
    fn replace_range<R>(&mut self, range: R, replace_with: &str)
    where
        R: RangeBounds<usize>,
    {
        self.replace_range(range, replace_with)
    }
}
/// `Stringy` for the array-backed `tinyvec_string::ArrayString`; only available with the
/// `tinyvec_string` feature.
///
/// Every method forwards to the method of the same name that `ArrayString` already provides.
#[cfg(feature = "tinyvec_string")]
#[cfg_attr(docsrs, doc(cfg(feature = "tinyvec_string")))]
impl<A: tinyvec_string::bytearray::ByteArray> Stringy for tinyvec_string::ArrayString<A> {
    #[inline]
    fn push(&mut self, ch: char) {
        self.push(ch)
    }

    #[inline]
    fn push_str(&mut self, string: &str) {
        self.push_str(string)
    }

    #[inline]
    fn pop(&mut self) -> Option<char> {
        self.pop()
    }

    #[inline]
    fn truncate(&mut self, new_len: usize) {
        self.truncate(new_len)
    }

    #[inline]
    fn replace_range<R>(&mut self, range: R, replace_with: &str)
    where
        R: RangeBounds<usize>,
    {
        self.replace_range(range, replace_with)
    }
}
776 /// Sanitise a file name into an existing `String`. Intended for power users only.
778 /// When you use [`sanitise`] or [`sanitise_with_options`], the perfect allocation is artisanally
779 /// crafted (or something). If you use this carelessly, you may actually cause *more* allocations
780 /// to be made, rather than fewer, or panic if `S` is a non-growable type (e.g.
781 /// `tinyvec_string::ArrayString`). You may therefore wish to use [`sufficient_alloc_size`] or
782 /// [`max_alloc_size`] or [`max_alloc_size_const`] in some cases to calculate how much more to
783 /// reserve ahead of time.
785 /// See [`Options`] for a description of what all the options do.
787 /// After calling this, `out` will be the same length or longer, never shorter. If you want to know
788 /// *how much* longer, store and compare the length yourself.
789 pub fn sanitise_to<R: Replace, S: Stringy>(input: &str, options: &Options<R>, out: &mut S) {
790 let protected = out.len();
791 // I said in the docs don’t set it to less than 10, but without this zero leads to some
792 // unreachable!() being reached, which is æsthetically displeasing, so I’m just going to return
793 // empty strings for unreasonably small length limits. 🙂
794 if options.length_limit < 10 {
798 // When label-break-value stabilises I’ll switch to that, but until then, loop it is.
799 #[allow(clippy::never_loop)]
800 loop { // breaks after exactly one iteration.
801 if options.extension_cleverness {
802 if let Some((base_name, extension)) = split_extension(input) {
803 // With extension-awareness, when the path exceeds length_limit, we prefer
804 // to truncate from the base name rather than from the extension. But we don’t
805 // know how much we’ll need to truncate until we’ve finished processing the
806 // extension, so we have to allocate a maximum of roughly twice as much as
807 // we’ll end up needing.
809 // For implementation convenience in this corner case, we’ve declared a maximum
810 // extension length of six less than length_limit (explained on
811 // Options::extension_cleverness).
812 let extension_length_limit = options.length_limit - 6;
813 sanitise_part(base_name, options, options.length_limit, false, out, protected);
814 let base_len = out.len() - protected;
816 let extension_truncated = sanitise_part(
819 extension_length_limit,
822 // It’s OK for trimming to take out the entire file name (so we’re deliberately
823 // not including base_len in this), but we mustn’t touch what’s not ours.
827 let mut total_len = out.len() - protected;
829 if total_len > options.length_limit {
830 if extension_truncated {
831 // Extension is unsalvageable: truncate from the end.
832 while total_len > options.length_limit {
834 Some(c) => total_len -= c.len_utf8(),
835 None => unreachable!(),
838 // Length is now acceptable, but that could have left us with
839 // undesirable trailing characters, so run trim again.
840 out.truncate(protected + trim_end(&out[protected..], options, true).len());
842 // (Sigh. Whose brilliant idea was it to try to preserve extensions anyway?
843 // Maybe if I’d realised the pain it’d cause I wouldn’t have bothered.
844 // It’s not like anyone *else* does it. But it is good, say I. And now I’m
845 // even mulling over grapheme-cluster-aware truncation. Am I mad?)
846 let base_name_end_index = base_len;
847 let mut base_chars = out[protected..protected + base_name_end_index].chars();
848 while total_len > options.length_limit {
849 match base_chars.next_back() {
851 total_len -= c.len_utf8();
853 None => unreachable!(),
856 let base_name = trim_end(
857 &out[protected..protected + base_chars.as_str().len()],
861 let range = protected + base_name.len()..protected + base_name_end_index;
862 out.replace_range(range, "");
869 // Extension cleverness disabled, or no extension found: the much simpler path!
870 sanitise_part(input, options, options.length_limit, false, out, protected);
874 // Finally one last bit of processing: checking names that are all dots
875 // (though normally windows_safe will already have truncated them to zero).
876 if (options.url_safe && (&out[protected..] == "." || &out[protected..] == "..")) ||
877 (options.most_fs_safe && out[protected..].chars().all(|c| c == '.'))
879 out.truncate(protected);
882 if out[protected..].is_empty() {
883 out.push_str(options.six_measures_of_barley);
887 fn sanitise_part<R: Replace, S: Stringy>(
889 options: &Options<R>,
896 let mut did_truncate = false;
898 let mut last_was_remove = false;
899 let mut last_was_whitespace = false;
900 out.extend(input.chars()
902 c = if options.normalise_whitespace && c.is_whitespace() { ' ' } else { c };
904 (options.most_fs_safe && !is_most_fs_safe_char(c)) ||
905 (options.windows_safe && !is_windows_safe_char(c)) ||
906 (options.url_safe && !is_url_safe_char(c)) ||
907 (options.remove_control_characters && c.is_control()) ||
908 (options.remove_reordering_characters && is_reordering_character(c))
911 .filter_map(|(c, remove)| {
912 if options.collapse_replacements {
913 if remove && last_was_remove {
916 last_was_remove = remove;
918 if remove { options.replace_with.replace(c) } else { Some(c) }
920 if options.normalise_whitespace {
921 let is_whitespace = c == ' ';
922 let drop = last_was_whitespace && is_whitespace;
923 last_was_whitespace = is_whitespace;
931 (options.trim_spaces_and_full_stops && is_space_or_full_stop(c)) ||
932 (options.trim_more_punctuation && is_more_punctuation_character(c))
935 let new_len = len + c.len_utf8();
936 if new_len <= length_limit {
947 // We’ve added something non-trimmed, that’ll guard the potential reserved name underscore.
948 out.truncate(protected + trim_end(&out[protected..], options, is_extension).len());
951 if !is_extension && options.windows_safe && is_reserved_windows_file_name(&out[protected..]) {
952 // This underscore looks to be in danger of being end-trimmed,
953 // but in practice we’ve ensured that it won’t be
954 // (except maybe one case with a lower length_limit than permitted).
958 // Whew. Finally done. Breathe a sigh of relief.
962 fn trim_end<'a, R: Replace>(out: &'a str, options: &Options<R>, is_extension: bool) -> &'a str {
963 let trim_space_or_full_stop = options.trim_spaces_and_full_stops ||
964 ((is_extension || !options.extension_cleverness) && options.windows_safe);
965 out.trim_end_matches(|c| {
966 (trim_space_or_full_stop && is_space_or_full_stop(c)) ||
967 (options.trim_more_punctuation && is_more_punctuation_character(c))
971 // A concession to those poor Americans et al. 😀
972 #[cfg(feature = "alloc
")]
973 #[cfg_attr(docsrs, doc(cfg(feature = "alloc
")))]
974 pub use sanitise as sanitize;
975 #[cfg(feature = "alloc
")]
976 #[cfg_attr(docsrs, doc(cfg(feature = "alloc
")))]
977 pub use sanitise_with_options as sanitize_with_options;
978 pub use sanitise_to as sanitize_to;
980 // How did this get to almost a thousand lines by this point? I’m sure I started out with only
981 // twenty or so. But then I got careful about allocations, and added extension cleverness, and
982 // added more features, and documented exhaustingly, and oops, a thousand lines, lines that are
983 // convoluted at times. Well, I succeeded in all my *functional* goals, with better precision,
984 // theoretically better but untested speed, better behaviour around extensions, single-allocation
985 // and even *no*-allocation operation; but utterly lost sight of simple and obviously-correct code.
986 // Was it worth it? Eh, probably.
990 #[cfg(feature = "alloc
")]
992 fn test_length_limit_things() {
993 // I wrote these tests before I wrote the matrix. I might as well delete them, but I haven’t.
995 let short = Options {
1000 assert_eq!(sanitise_with_options("abcdef
.ghij
", &short), "abcde
.ghij
");
1002 // Unsalvageable extension
1003 assert_eq!(sanitise_with_options("abcde
.fghij
", &short), "abcde
.fghi
");
1005 // Windows reserved name protection
1006 assert_eq!(sanitise_with_options("AUX
.abcdef
", &short), "AUX_
.abcd
");
1007 assert_eq!(sanitise_with_options("AUX
.abcdef
", &Options { windows_safe: false, ..short }),
1009 assert_eq!(sanitise_with_options("lpT7
.abcdef
", &short), "lpT7_
.abcd
");
1010 assert_eq!(sanitise_with_options("cOm6
.abcdef
", &Options { windows_safe: false, ..short }),
1013 assert_eq!(sanitise("CON
"), "CON_
");
1014 assert_eq!(sanitise("aux
.h
"), "aux_
.h
");
1015 assert_eq!(sanitise("Lpt1
.exe
"), "Lpt1_
.exe
");
1016 assert_eq!(sanitise("xyz
"), "xyz
");
1017 assert_eq!(sanitise(""), "_
");
1018 assert_eq!(sanitise("nül
"), "nül
");
1019 assert_eq!(sanitise("COM1
.jpg
.png
"), "COM1
.jpg
.png
");
1022 #[cfg(feature = "alloc
")]
1025 // Look, I know I said I didn’t want std, but I *need* it for these tests, y’see?
1026 #[cfg(not(feature = "std
"))]
1028 use std::prelude::rust_2021::*;
1029 use std::fmt::Write;
1030 use std::{eprintln, println, format, vec};
1034 #[cfg(feature = "tinyvec_string
")]
1035 A: tinyvec_string::bytearray::ByteArray,
1037 set_name: &'static str,
1038 options_name: &'static str,
1039 options: &Options<R>,
1040 paths: &mut Vec<String>,
1041 unsteady_state: &mut Vec<(&'static str, &'static str, String, String, String)>,
1042 // Apparently you can’t do `case::<#[cfg] A>()`, so we have to do this instead.
1043 #[cfg(feature = "tinyvec_string
")]
1044 _: std::marker::PhantomData<A>,
1046 println!("Sanitising {set_name} with options {options_name}
");
1047 #[cfg(feature = "tinyvec_string
")]
1048 let mut array_string = tinyvec_string::ArrayString::<A>::new();
1049 let mut sanitised = String::new();
1050 let mut capacity = String::new();
1051 let mut scratch = if options_name == "passthrough
" {
1052 // “memory allocation of 18446744073709551615 bytes failed” 😀
1055 String::with_capacity(max_alloc_size(options) + 1)
1057 let scratch_size = scratch.capacity();
1058 for input in std::fs::read_to_string(format!("tests
/{set_name}
.txt
")).unwrap().lines() {
1059 let output = sanitise_with_options(input, options);
1060 // A couple of sanity checks make sense here.
1061 if output.len() > options.length_limit {
1063 "Input {input} sanitised to {output}
, which at {len} is greater than the allowed {max}
",
1065 max = options.length_limit,
1068 if options.windows_safe && is_reserved_windows_file_name(
1069 options.extension_cleverness.then(|| &*output)
1070 .and_then(split_extension)
1071 .map(|(base, _)| base)
1074 panic!("Input {input} sanitised to {output}
, which is a reserved Windows file name
");
1077 // And ensure sanitise_to is working properly also.
1078 scratch.truncate(0);
1079 scratch.push('.'); // A trimmable character, and not six_measures_of_barley.
1080 sanitise_to(input, options, &mut scratch);
1081 assert_eq!(scratch.chars().next(), Some('.'));
1082 assert_eq!(scratch[1..], output);
1083 #[cfg(feature = "tinyvec_string
")]
1085 if array_string.capacity() > 0 {
1086 array_string.truncate(0);
1087 array_string.push(' '); // A trimmable character, and not six_measures_of_barley.
1088 sanitise_to(input, options, &mut array_string);
1089 assert_eq!(array_string.chars().next(), Some(' '));
1090 assert_eq!(array_string[1..], output);
1094 sanitised.push_str(&output);
1095 sanitised.push('\n');
1096 let _ = writeln!(capacity, "{}
", output.capacity());
1097 if input != output {
1098 if options_name == "passthrough
" {
1099 unsteady_state.push(
1100 (set_name, options_name, input.to_owned(), output.clone(), output),
1103 let repeated = sanitise_with_options(&output, options);
1104 if repeated != output {
1105 sanitised.push_str("⚠ Sanitisation did not reach a steady state
. Next line shows the effect of resanitising the line above
. ⚠
\n");
1106 sanitised.push_str(&repeated);
1107 sanitised.push('\n');
1108 unsteady_state.push(
1109 (set_name, options_name, input.to_owned(), output, repeated),
1115 let sanitised_name = format!("tests
/{set_name}
.{options_name}
.sanitised
");
1116 let capacity_name = format!("tests
/{set_name}
.{options_name}
.capacity
");
1117 std::fs::write(&sanitised_name, sanitised).unwrap();
1118 std::fs::write(&capacity_name, capacity).unwrap();
1119 paths.push(sanitised_name);
1120 paths.push(capacity_name);
1121 if options_name != "passthrough
" {
1122 assert_eq!(scratch_size, scratch.capacity(), "scratch buffer reallocated
");
1126 let mut unsteady_state = vec![];
1127 let mut paths = vec![];
1128 let d = Options::DEFAULT;
1129 for name in ["blns
", "misc
"] {
1131 // On $array_size: I tried using roughly { max_alloc_size_const(&options) + 1 },
1132 // but threading it all through was just too painful, especially in the absence of
1133 // const-fn-trait-bound. So I’ll just do one separate test for that.
1134 ($array_size:literal, $options_name:expr, $options:expr) => {
1135 let options = &$options;
1136 // +1 for the ' ' we prefix.
1137 let required_size = max_alloc_size(options).saturating_add(1);
1138 assert!($array_size == 0 || required_size <= $array_size,
1139 "Test case design error
: array being given {} bytes
, but {} are needed
",
1148 &mut unsteady_state,
1149 // TODO: after https://github.com/ThatsNoMoon/tinyvec_string/issues/3 is
1150 // resolved, ditch $array_size and use max_alloc_size instead.
1151 #[cfg(feature = "tinyvec_string
")]
1152 std::marker::PhantomData::<[u8; $array_size]>,
1156 // Assumption that I decline to “test” because it’d be silly:
1157 // sanitise(…) == sanitise_with_options(…, &Options::DEFAULT).
1158 case!(512, "default", d);
1159 case!(512, "realistic
-length_limit
-reduction
", Options { length_limit: Options::DEFAULT.length_limit - 4, ..d });
1160 case!(512, "url_safe
", Options { url_safe: true, ..d });
1161 case!(512, "silly
-replace_with
", Options::DEFAULT.with_replace_with(|c| char::from_u32(c as u32 + 1)));
1162 case!(512, "no
-windows_safe
", Options { windows_safe: false, ..d });
1163 case!(256, "no
-extension_cleverness
", Options { extension_cleverness: false, ..d });
1165 case!(11, "short
-sans
-extension_cleverness
", Options { length_limit: 10, extension_cleverness: false, ..d });
1167 case!(16, "short
", Options { length_limit: 10, ..d });
1168 case!(0, "passthrough
", Options {
1169 length_limit: usize::MAX,
1171 extension_cleverness: false,
1172 most_fs_safe: false,
1173 windows_safe: false,
1175 normalise_whitespace: false,
1176 trim_spaces_and_full_stops: false,
1177 trim_more_punctuation: false,
1178 remove_control_characters: false,
1179 remove_reordering_characters: false,
1181 collapse_replacements: false,
1182 six_measures_of_barley: "",
1184 macro_rules! case_only {
1185 ($option:ident) => {{
1186 let mut options = Options {
1187 most_fs_safe: false,
1188 windows_safe: false,
1190 normalise_whitespace: false,
1191 trim_spaces_and_full_stops: false,
1192 trim_more_punctuation: false,
1193 remove_control_characters: false,
1194 remove_reordering_characters: false,
1197 options.$option = true;
1198 case!(512, concat!("just
-", stringify!($option)), options);
1201 case_only!(most_fs_safe);
1202 case_only!(windows_safe);
1203 case_only!(url_safe);
1204 case_only!(normalise_whitespace);
1205 case_only!(trim_spaces_and_full_stops);
1206 case_only!(trim_more_punctuation);
1207 case_only!(remove_control_characters);
1208 case_only!(remove_reordering_characters);
1209 // Eh, I’m bored now. That’ll do.
1212 let mut complain_of_unsteady_states = false;
1213 if !unsteady_state.is_empty() {
1214 for (set, options, original, first, second) in &unsteady_state {
1215 match (*set, *options, &**original, &**first, &**second) {
1216 ("blns
", "short
", "Dr
. Herman I
. Libshitz
", "Dr
. Herman
", "Dr
.Herm
") |
1217 ("blns
", "short
", r#"{{ "".__class__
.__mro__
[2].__subclasses__()[40]("/etc/passwd").read() }}"#, "{{ __
.__cl
", "{{.cl
") => {
1218 // Skip known cases of unsalvageable extensions combining with dots in the base
1219 // name to effectively give a new extension, making quiescence take two steps.
1220 // Making these steady-state would take too much effort, and the harm is
1221 // minimal (the unsteady state is still a correctly sanitised name).
1223 (_, "silly
-replace_with
", _, _, _) => {
1224 // Certainly this one isn’t steady-state!
1226 (_, "passthrough
", _, _, _) => {
1227 complain_of_unsteady_states = true;
1228 eprintln!("Unsteady state
in {set} with {options} options
, diff tests
/{set}
.{options}
.txt and tests
/{set}
.{options}
.sanitised
");
1231 complain_of_unsteady_states = true;
1232 eprintln!("Unknown unsteady state
in {set} with {options} options
, look
for the ⚠ symbol
in tests
/{set}
.{options}
.sanitised
");
1238 if !std::process::Command::new("git
")
1247 panic!("sanitisation produced different results than are known
, review the diffs
");
1250 // A guard against committing an unsteady state.
1251 if complain_of_unsteady_states {
1252 panic!("Some sanitisations unexpectedly failed to reach a steady state
.");
1256 #[cfg(feature = "tinyvec_string
")]
1259 fn test_tinyvec_string_panic() {
1260 // I’ve already tested various normal cases, including that ridiculously long strings don’t
1261 // cause overflow on moderately limited arrays with moderate length limits; but I haven’t
1262 // demonstrated the panic that occurs if the array is too short. So here’s this now. 🙂
1266 &mut tinyvec_string::ArrayString::<[u8; 12]>::new(),
1270 #[cfg(feature = "tinyvec_string
")]
1272 fn test_tinyvec_string_max_alloc_size() {
1273 use tinyvec_string::ArrayString;
1274 // Note: this is *currently* 505, but I declare that not part of the compatibility contract;
1275 // extension cleverness/grapheme cluster changes could lead to it increasing to 510.
1276 let mut string: ArrayString<[u8; 505]> =
1277 ArrayString::<[u8; max_alloc_size_const(&Options::DEFAULT)]>::new();
1278 assert_eq!(string.capacity(), 505);
1280 "Watch everything being hunky dory
, even when I throw unreasonably long values at it all
. \
1281 Even when dots
become extension separators
; yes
, even then
. Then further
: into the deep
\
1282 reaches of testing
, where things start to get garbled
, and having written at least
255 \
1283 characters
, I have to now write just
as much of extension—horror
. But that was a full
\
1284 stop so that this could now be the extension
, and I can’t put a dot
for the next
while›
\
1285 ¿Whatever will I
do? I suspect things are getting out of hand here
, but I can’t stop now
; \
1286 I’m running out of things to write
, but it should be enough by now
!",
1290 assert_eq!(string, "Watch everything being hunky dory
, even when I throw unreasonably long
\
1291 values at it all
. Even when dots
become extension separators
; yes
, even then
. Then
\
1292 further_ into the deep reaches of testing
, where things start to get garbled
, and having
\