src/lib.rs - sanitise-file-name blob - Chris Morgan’s Git repositories

   1 //! Sanitise names for use in file systems and the likes.
   2 //!
//! The output string is guaranteed to be shorter than or equal to the input string in length,
//! with two exceptions: file names that are reserved on Windows (see [`Options::windows_safe`]),
//! in which case an underscore is appended to the base name (e.g. NUL → NUL_, aux.h → aux_.h);
//! and names that would otherwise be left empty, which are replaced with
//! [`Options::six_measures_of_barley`] (`_` by default).
   6 //!
   7 //! The key parts of the API:
   8 //!
   9 #![cfg_attr(feature = "alloc", doc = "\
  10  - <code>[sanitise][](input: &str) -> String</code>: the simplest thing to call;
  11
  12  - <code>[sanitise_with_options][](input: &str, options: &Options&lt;_>) -> String</code>:
  13    when you want to tweak the nature of the sanitisation; and")]
  14 #![cfg_attr(not(feature = "alloc"), doc = "\
  15  - <s><code>[sanitise][](input: &str) -> String</code>: the simplest thing to call</s>
  16    *(disabled in this build due to compiling without the `alloc` feature)*;
  17
  18  - <s><code>[sanitise_with_options][](input: &str, options: &Options&lt;_>) -> String</code>:
  19    when you want to tweak the nature of the sanitisation</s> *(disabled in this build due to
  20    compiling without the `alloc` feature)*; and")]
  21 //!
  22 //! - [`Options`], with detailed descriptions of each option.
  23 //!
  24 //! And for advanced users that want to control allocations or other similar things:
  25 //!
  26 //! - <code>[sanitise_to][](input: &str, options: &Options&lt;_>, out: &mut <em>String</em>)</code>,
  27 //!   sanitising into
  28 #![cfg_attr(feature = "alloc", doc = "   a `String`")]
  29 #![cfg_attr(not(feature = "alloc"), doc = "   a string")]
  30 #![cfg_attr(feature = "tinyvec_string", doc = "   or [`tinyvec_string::ArrayString`]")]
  31 #![cfg_attr(all(docsrs, feature = "tinyvec_string"), doc = "   (when enabled)")]
  32 //!   that you provide,
  33 //!   for which the following methods may help:
  34 //!
  35 //! - <code>[max_alloc_size][](options: &Options&lt;_>)</code> or
  36 //!   <code>[max_alloc_size_const][](options: &Options&lt;Option&lt;char>>)</code>,
  37 //!   to suggest a size for scratch buffer
  38 #![cfg_attr(feature = "tinyvec_string", doc = "   or `ArrayString`")]
  39 //!   applications; and
  40 //!
  41 //! - <code>[sufficient_alloc_size][](input: &str, options: &Options&lt;_>) -> usize</code>, to
  42 //!   suggest a size that will definitely be sufficient for one given input (mainly useful when you
  43 //!   are crafting a path with stuff before and after it).
  44 //!
  45 //! … but that’s dangerous territory, deep rabbit holes; ask if you actually *need* them—don’t be
  46 //! like me. (When I am laid in earth, may my wrongs create no trouble in thy breast. Remember me,
  47 //! but ah! forget my fate.)
  48 //!
  49 //! ### Conditional compilation/Cargo features
  50 //!
  51 //! This crate has several features:
  52 //!
  53 //! - **std**, enabled by default. Implies *alloc*. Disable it to get `#![no_std]` operation.
  54 //!
  55 //! - **alloc**, enabled by default via *std*. Provides the ability to sanitise to a `String` in
  56 //!   `sanitise_to`, and the `sanitise` and `sanitise_with_options` functions.
  57 //!
  58 //! - **tinyvec_string**, disabled by default. Provides the ability to sanitise to
  59 //!   `tinyvec_string::ArrayString`, which works without *alloc*.
  60 //!
  61 //! - **const-fn-trait-bound**, disabled by default, requires rustc nightly at the time of writing.
  62 //!   Makes [`max_alloc_size`] const.
  63 //!
  64 //! These docs were built with these features enabled:
  65 #![cfg_attr(feature = "std", doc = " <span class='stab portability'><code>std</code></span>")]
  66 #![cfg_attr(feature = "alloc", doc = " <span class='stab portability'><code>alloc</code></span>")]
  67 #![cfg_attr(feature = "tinyvec_string", doc = " <span class='stab portability'><code>tinyvec_string</code></span>")]
  68 #![cfg_attr(feature = "const-fn-trait-bound", doc = " <span class='stab portability'><code>const-fn-trait-bound</code></span>")]
  69 #![cfg_attr(
  70     all(
  71         not(feature = "std"),
  72         not(feature = "alloc"),
  73         not(feature = "tinyvec_string"),
  74         not(feature = "const-fn-trait-bound"),
  75     ),
  76     doc = " *(none of them)*")]
  77 //!
  78 //! … and these features disabled:
  79 #![cfg_attr(not(feature = "std"), doc = " <span class='stab portability'><code>std</code></span>")]
  80 #![cfg_attr(not(feature = "alloc"), doc = " <span class='stab portability'><code>alloc</code></span>")]
  81 #![cfg_attr(not(feature = "tinyvec_string"), doc = " <span class='stab portability'><code>tinyvec_string</code></span>")]
  82 #![cfg_attr(not(feature = "const-fn-trait-bound"), doc = " <span class='stab portability'><code>const-fn-trait-bound</code></span>")]
  83 #![cfg_attr(
  84     all(
  85         feature = "std",
  86         feature = "alloc",
  87         feature = "tinyvec_string",
  88         feature = "const-fn-trait-bound",
  89     ),
  90     doc = " *(none of them)*")]
  91
  92 // End docs.
  93
  94 #![cfg_attr(not(feature = "std"), no_std)]
  95 #![cfg_attr(feature = "const-fn-trait-bound", feature(const_fn_trait_bound))]
  96 #![cfg_attr(docsrs, feature(doc_cfg))]
  97 #![cfg_attr(not(feature = "alloc"), allow(rustdoc::broken_intra_doc_links))]  // I’m lazy.
  98
  99 #[cfg(feature = "alloc")]
 100 extern crate alloc;
 101 #[cfg(feature = "alloc")]
 102 use alloc::string::String;
 103
 104 use core::ops::{Deref, Index, Range, RangeFrom, RangeBounds};
 105
/// Sanitisation options. Defaults marked on each field.
///
/// Take a look around, but I think everything’s pretty sane by default; the ones I think you’re
/// most likely to want to change are `url_safe` and `windows_safe`, though `replace_with`,
/// `collapse_replacements` and `six_measures_of_barley` can be interesting too for yielding
/// prettier results.
///
/// If you set `length_limit` to `usize::MAX`, all the bool fields to `false`, and
/// `six_measures_of_barley` to an empty string, `sanitise_with_options` will not alter the input
/// string in any way. But that would be a rather expensive alternative to `.clone()`. In
/// practice, I doubt you ever want to disable `most_fs_safe`, which is a good baseline.
#[derive(Debug)]
pub struct Options<R: Replace> {
    /// Limit the complete file name to this many UTF-8 code units. The default is **255**, which
    /// is suitable for all practical platforms.
    ///
    /// (Some file systems limit lengths in UTF-8 code units and some in UTF-16 code units, but
    /// UTF-16 never takes more code units than UTF-8 to encode a given Unicode string, so we can
    /// ignore it.)
    ///
    /// Reasons you might want to reduce it:
    ///
    /// 1. You haven’t appended the extension yet, and so want to subtract the extension’s length.
    ///    (In that case I suggest writing `Options::DEFAULT.length_limit` instead of hard coding
    ///    255—that’ll work in const context.)
    ///
    /// 2. You want smoother Windows support, for on Windows some things start falling over if the
    ///    total path length is greater than 260 characters; so measuring or estimating the path
    ///    length could potentially be useful—but unless you know, probably don’t worry too much,
    ///    someone’ll probably drop it deep in a node_modules tree at some point and then you’ll be
    ///    in trouble anyway. 😀 <!-- Okay, okay, so node_modules trees are typically flattened
    ///    almost entirely these days; but let me have my joke, please? -->
    ///
    /// One other mildly significant note here: if you care about Apple’s pre-2017 HFS+ file
    /// system, you should perform Unicode normalisation to NFD (most likely via the
    /// `unicode-normalization` crate) before performing sanitisation, because the decomposed form
    /// may be longer; if you don’t, then the path will be normalised to NFD by the file system
    /// when you try to write it, which could take it over 255 and make it fail. I don’t think
    /// there are any popular file systems that normalise any more, though APFS kinda prefers NFC,
    /// so you might want to normalise to NFC. I do not know if normalising to NFC will ever
    /// lengthen a UTF-8 string, but the spec allows it to (UAX #15, goal 3.2).
    ///
    /// The minimum permitted value is 10, for reasons of implementation convenience and because I
    /// don’t think there’s any legitimate use case for a smaller value. If you provide a value
    /// less than ten, you’ll get an empty string back every time.
    ///
    /// Truncations are performed at `char` granularity (Unicode scalar value), which means that
    /// extended grapheme clusters could be broken. This could change in the future (it’ll be an
    /// optional dependency on `unicode-segmentation`), but for now it was just too much thought.
    /// If I ever implement this, I’ll probably ditch the minimum value of 10 too.
    // (Most significantly, it doesn’t play terribly nicely with extension cleverness: six would no
    // longer be sufficient to guarantee a base name, so more involved calculations and overflow
    // tracking would need to be done. It’s perfectly achievable, but painful.)
    pub length_limit: usize,

    /// When allocating the string (since it allocates as small a string as possible), reserve at
    /// least this many extra bytes. This is good for efficiency when you append the extension
    /// after sanitisation (in which case, also disable `extension_cleverness`). Default **0**.
    pub reserve_extra: usize,

    /// Make other options try to be clever about a file extension in the input. Default `true`.
    ///
    /// Specifically, if a file extension is detected (done by looking for the last full stop in
    /// the name, and splitting at that point into base name and extension):
    ///
    /// 1. `length_limit` will try to keep the extension intact, truncating the base name rather
    ///    than the extension. “Try”, because if the extension is longer than six code units less
    ///    than the length limit, it will be deemed unsalvageable. (Why six? The base name must
    ///    retain at least one character, so for convenience that’s four UTF-8 code units, plus one
    ///    more for the dot, and if `windows_safe` is on, the longest reserved name causes a five
    ///    code unit base name like `LPT1_`, and ridiculously long extensions are a corner case
    ///    anyway so I decided to just call it a day at six. If I subsequently implement
    ///    grapheme-cluster-aware truncation, this six will increase if the first grapheme cluster
    ///    in the base name is more than five code units long.) An unsalvageable extension is the
    ///    only case where sanitisation may take two steps to quiesce, rather than one: if the
    ///    extension is entirely truncated and the base name contains a dot which in a subsequent
    ///    run will be interpreted as the extension separator, trimming will happen around it on
    ///    that subsequent run but not the first.
    ///
    /// 2. `windows_safe` will detect reserved names with extensions.
    ///
    /// 3. `trim_spaces_and_full_stops` and `trim_more_punctuation` will trim those characters from
    ///    the end of the base name and the start of the extension, in addition to the start and
    ///    end of the full name. (Expressed otherwise, the base name and extension will be trimmed
    ///    independently.)
    ///
    /// If you’re appending the extension after sanitisation, you should turn this to false.
    pub extension_cleverness: bool,

    /// Remove characters that are not safe on just about any file system. Default `true`, and if
    /// you actually want to disable it you’re probably using the wrong crate.
    ///
    /// This plus `length_limit` is enough to satisfy most platforms other than Windows, though
    /// cleaning somewhat more is probably a good idea.
    ///
    /// Characters removed:
    ///
    /// - `/` (slash)
    /// - ␀ (null, character zero)
    ///
    /// Also disallows names comprising exclusively dots (`"."`, `".."`, `"..."`, *&c.*), NOT using
    /// `replace_with` on them but yielding an empty string.
    ///
    /// This is a tiny subset of `windows_safe`.
    pub most_fs_safe: bool,

    /// Ensure the file name is safe on Windows. Default `true`.
    ///
    /// [These are the rules applied:](https://docs.microsoft.com/en-au/windows/win32/fileio/naming-a-file#naming-conventions)
    ///
    /// - These characters are removed (and `replace_with` employed):
    ///
    ///   - `<` (less than)
    ///   - `>` (greater than)
    ///   - `:` (colon)
    ///   - `"` (double quote)
    ///   - `/` (forward slash)
    ///   - `\` (backslash)
    ///   - `|` (vertical bar/pipe)
    ///   - `?` (question mark)
    ///   - `*` (asterisk)
    ///   - The C0 control characters, 0–31 and 127 (U+0000–U+001F, U+007F); note that U+007F isn’t
    ///     actually part of C0, but Microsoft included it in this list so I do too.
    ///
    /// - Names must not end with a space or a dot (so these are removed recursively—for reasons of
    ///   technical convenience, `replace_with` is NOT employed).
    ///
    /// - These names are reserved (and so a trailing underscore is added to the base name),
    ///   including with an extension if `extension_cleverness` is enabled:
    ///
    ///   - CON, PRN, AUX, NUL,
    ///   - COM1, COM2, COM3, COM4, COM5, COM6, COM7, COM8, COM9,
    ///   - LPT1, LPT2, LPT3, LPT4, LPT5, LPT6, LPT7, LPT8, and LPT9
    ///
    /// Most of these restrictions are actually not quite universal in Windows, but getting around
    /// them requires switching into POSIX mode or using long UNC paths (e.g. `\\.\C:\CON`,
    /// `\\?\D:\aux.h`), and your life will certainly be miserable if you try using them; so
    /// they’re all considered not Windows-safe.
    pub windows_safe: bool,

    /// Remove characters that may be problematic in the usual places in URLs. Default `false`.
    ///
    /// If you want something URL-safe, consider slugifying instead (see below).
    ///
    /// This removes any character that is not what’s called a [*URL code point*], also removes the
    /// characters `&`, `/` and `?`, and forbids the names `.` and `..` which have a special
    /// meaning in paths. The result is either an empty string, or suitable for use as a path
    /// component, query string value or fragment, without generally *needing* percent-encoding:
    /// such a URL will be correctly parsed by a WHATWG URL Standard parser, though nominally
    /// invalid¹, but older or poorer-quality URL parsers may need percent-encoding to cope with
    /// the non-ASCII that is retained.
    ///
    /// Some notable characters that are removed: `/`, `\`, `%`, `?`, `#`, `&`, `"`, and space.
    ///
    /// Almost all non-ASCII is retained.
    ///
    /// Notes on using these URLs in some common formats:
    ///
    /// - In HTML, no escaping is needed in `<a href="http://www.example/fïle_ñamê">`, because `&`
    ///   and `"` are the only two characters needing escaping in a double-quoted attribute value,
    ///   and both are removed by `url_safe`.
    ///
    /// - In plain text formats following the longstanding convention of angle bracket
    ///   delimitation (`<http://www.example/lïke_τhis>`), no escaping should be required as `>` is
    ///   removed by `url_safe`. This includes Markdown. However, some such parsers could be
    ///   stricter about what’s allowed inside the angle brackets, so you may need or want to use a
    ///   URL Standard serialiser to do percent-encoding of the non-ASCII.
    ///
    /// - In Markdown `[text](href)` links, you’ll want to manually percent-encode `(` to `%28` and
    ///   `)` to `%29`. This is yet another bad choice in Markdown’s technical foundation:
    ///   parentheses aren’t percent-encoded, never have been; so using a URL Standard serialiser
    ///   won’t help you, you’ll instead need to manually encode them, or unpaired parentheses will
    ///   break the link and possibly eat your laundry².
    ///
    /// Given that this produces nominally-invalid URLs, you may be wondering why to bother at all;
    /// it really comes down to characters like `?`, `/` and `#`: you *can* include them in paths
    /// by percent-encoding, but it’s too likely that *somewhere* along the way, *something* will
    /// mangle your path, not encoding it properly, and everything will break—basically the entire
    /// *system* has to process the URL correctly; ever tried a path component containing `%2F`?
    /// But if you’ve removed the genuinely problematic characters, then in theory things can no
    /// longer go wrong once you’re past the parser. And being able to skip percent-encoding your
    /// URLs when you know you’ll be using a proper URL parser is nice.
    ///
    /// I deliberately haven’t provided an option for removing characters that would make a URL
    /// nominally invalid (which is “non-ASCII”), because I think that goes too far: in such a
    /// case, I don’t think you should *strip* such characters, but rather slugify the whole thing
    /// (which can do things like `Voilà!` → `voila`).
    ///
    /// `replace_with` is used for the character removals, but NOT for the forbidding of the names
    /// `"."` and `".."`, for which it will instead yield an empty string.
    ///
    /// —⁂—
    ///
    /// ¹ “Invalid” is just a label in WHATWG specs; it doesn’t change anything, and parsing is
    ///   still well-defined, it’s generally just a hint that either you may have made a mistake,
    ///   or that older tools might not handle this case the same way.
    ///
    /// ² When Americans say “eat your laundry” they mean the *clothes*. An Australian seeking to
    ///   express *that* concept would say “eat your washing” (and probably be looked at strangely
    ///   because it’s not an expression in common use). The laundry is the room in which clothes
    ///   are washed; so when I say injection attacks might eat your laundry——
    ///
    /// [*URL code point*]: https://url.spec.whatwg.org/#url-code-points
    pub url_safe: bool,

    /// Replace all sequences of whitespace with one space. Default `true`.
    ///
    /// This uses the Unicode `White_Space` property to decide ([`char::is_whitespace`]).
    ///
    /// This is done in two phases:
    ///
    /// 1. Before safety character replacements, each whitespace character is normalised to a
    ///    U+0020 SPACE; `replace_with` is not invoked.
    ///
    /// 2. After all character replacements, adjacent spaces (including any produced by
    ///    `replace_with`, independent of `collapse_replacements`) are collapsed to just one.
    pub normalise_whitespace: bool,

    /// Remove spaces and full stops (`.`) from the start and end of the name. Default `true`.
    ///
    /// `normalise_whitespace` is performed before this; with it on, this will trim all whitespace,
    /// with it off it’ll only trim U+0020 SPACE.
    ///
    /// All things that invoke `replace_with` are performed before this; thus, if you replace a
    /// character with a space or full stop, that could get trimmed. `replace_with` is not invoked
    /// on any characters removed by this.
    ///
    /// If `extension_cleverness` is enabled (which it is by default), on names with an extension
    /// this trims from the start and end of the base name and extension independently, rather than
    /// just the start and end of the full string. That is, `" foo . bar . baz "` will become
    /// `"foo . bar.baz"` with `extension_cleverness`, and `"foo . bar . baz"` without.
    ///
    /// This is independent of `windows_safe`, which also trims trailing spaces and dots from the
    /// complete name.
    // BTW: U+002E is named FULL STOP, oh uncouth Americans. 😀
    pub trim_spaces_and_full_stops: bool,

    /// Remove a few more punctuationy characters from the start and end of the name.
    /// Default `true`.
    ///
    /// This is a more aggressive supplement to `trim_spaces_and_full_stops`, trimming from the
    /// same places in the same way. These characters are removed:
    ///
    /// - `_` (underscore; especially significant because `replace_with` defaults to an underscore)
    /// - `-` (hyphen/dash/minus)
    /// - `,` (comma)
    /// - `;` (semicolon)
    pub trim_more_punctuation: bool,

    /// Remove control characters. Default `true`.
    ///
    /// This removes all characters with the general category *Control*: C0 controls U+0000–U+001F,
    /// control character U+007F, and C1 controls U+0080–U+009F.
    ///
    /// `replace_with` is invoked on these removals.
    pub remove_control_characters: bool,

    /// Remove BiDi control characters that are relevant to reordering attacks. Default `true`.
    ///
    /// <https://trojansource.codes/trojan-source.pdf> is a paper with info about the attack.
    ///
    /// This removes U+202A–U+202E and U+2066–U+2069. It does NOT remove the remaining three
    /// Bidi_Control characters U+061C, U+200E and U+200F (ALM, LRM, RLM),
    /// which are not implicated in the attack and are conceivably useful in file names.
    ///
    /// `replace_with` is invoked on these removals.
    pub remove_reordering_characters: bool,

    /// Where characters are removed (except as marked), replace them with this.
    /// Default `Some('_')`.
    ///
    /// If you provide a character that would normally be removed, it will not be removed: that
    /// processing is done once only.
    ///
    /// If you provide a character that would be trimmed, it may or may not be trimmed: end matches
    /// will be trimmed, start matches only will be if ridiculously long names and/or extensions
    /// force unusual truncation, exposing the start of the string (so that it gets trimmed to
    /// nothing).
    pub replace_with: R,

    /// Where multiple adjacent characters are to be replaced, only replace the first, and remove
    /// any subsequent ones. Default `false`.
    ///
    /// See also `normalise_whitespace`, which can collapse replacements if you replace with
    /// whitespace.
    pub collapse_replacements: bool,

    /// If sanitisation would leave the path empty, return this string instead. Default `"_"`.
    ///
    /// This exists because I found myself writing `if name.is_empty() { name.push('_') }` after
    /// every time I called `sanitise`. I think most of the time you don’t want to be left with an
    /// empty string, and inserting *something* is tolerable, so this is on by default as something
    /// fairly neutral that aligns with the `replace_with` default as well. You can effectively
    /// disable this by setting this to an empty string.
    ///
    /// `length_limit` is not taken into account on this. If you put something ridiculously long in
    /// it, you brought it on yourself and I wash my hands of it, as Pontius Pilate of old.
    ///
    /// (Read *Ruth 3:15–17* from the Bible to understand the name of this option.)
    pub six_measures_of_barley: &'static str,
}
 407
 408 // Implemented on just one type for inference reasons. One might wonder why I use an associated
 409 // constant at all. This would not be an unreasonable thing to wonder.
 410 impl Options<Option<char>> {
 411     /// The default options. This is more useful than `Options::default()` (which just returns
 412     /// this) because it’s const, so you can access `Options::DEFAULT.length_limit` in const
 413     /// context.
 414     pub const DEFAULT: Self = Options {
 415         length_limit: 255,
 416         reserve_extra: 0,
 417         extension_cleverness: true,
 418         most_fs_safe: true,
 419         windows_safe: true,
 420         url_safe: false,
 421         normalise_whitespace: true,
 422         trim_spaces_and_full_stops: true,
 423         trim_more_punctuation: true,
 424         remove_control_characters: true,
 425         remove_reordering_characters: true,
 426         replace_with: Some('_'),
 427         collapse_replacements: false,
 428         six_measures_of_barley: "_",
 429     };
 430 }
 431
 432 impl Default for Options<Option<char>> {
 433     fn default() -> Self {
 434         Self::DEFAULT
 435     }
 436 }
 437
 438 impl<R: Replace> Options<R> {
 439     /// A workaround for an otherwise-messy type situation with filling in defaults.
 440     ///
 441     /// This solves the problem that you can’t write this:
 442     ///
 443     /// ```rust,ignore
 444     /// Options { replace_with: |c| /* … */, ..Options::DEFAULT }
 445     /// ```
 446     ///
 447     /// … because struct update syntax doesn’t currently allow you to change types, and
 448     /// `Options::DEFAULT` is an `Options<Option<char>>`, but with a closure for `replace_with`
 449     /// you’re needing to change it to `Options<[closure@…]>`. So instead, write like one of these:
 450     ///
 451     /// ```rust,ignore
 452     /// Options::DEFAULT.with_replace_with(|c| /* … */)
 453     /// Options { /* … */, ..Options::DEFAULT }.with_replace_with(|c| /* … */)
 454     /// ```
 455     ///
 456     /// If you’re using nightly rustc, you can try the [incomplete type-changing-struct-update
 457     /// feature](https://github.com/rust-lang/rust/issues/86555) instead, which lets the first code
 458     /// work (so long as this unstable and incomplete feature is working):
 459     ///
 460     /// ```rust,ignore
 461     /// #![feature(type_changing_struct_update)]
 462     /// use sanitise_file_name::Options;
 463     ///
 464     /// fn main() {
 465     ///     Options { replace_with: |c| /* … */, ..Options::DEFAULT }
 466     /// }
 467     /// ```
 468     pub fn with_replace_with<R2: Replace>(self, new_replace_with: R2) -> Options<R2> {
 469         Options {
 470             length_limit: self.length_limit,
 471             reserve_extra: self.reserve_extra,
 472             extension_cleverness: self.extension_cleverness,
 473             most_fs_safe: self.most_fs_safe,
 474             windows_safe: self.windows_safe,
 475             url_safe: self.url_safe,
 476             normalise_whitespace: self.normalise_whitespace,
 477             trim_spaces_and_full_stops: self.trim_spaces_and_full_stops,
 478             trim_more_punctuation: self.trim_more_punctuation,
 479             remove_control_characters: self.remove_control_characters,
 480             remove_reordering_characters: self.remove_reordering_characters,
 481             replace_with: new_replace_with,
 482             collapse_replacements: self.collapse_replacements,
 483             six_measures_of_barley: self.six_measures_of_barley,
 484         }
 485     }
 486 }
 487
/// See [`Options::replace_with`].
///
/// Decides what, if anything, is emitted in place of a character that is being removed.
pub trait Replace {
    // “Why no *string* replacement?” I hear you ask.
    // Because then I couldn’t guarantee one allocation.
    /// Called with the character being removed. Return `Some(c)` to put `c` in its place, or
    /// `None` to remove it without any replacement.
    fn replace(&self, char_being_removed: char) -> Option<char>;
}
 494
 495 /// `None`: just remove the character, don’t replace it.
 496 /// `Some`: replace the character with this character.
 497 impl Replace for Option<char> {
 498     fn replace(&self, _: char) -> Option<char> {
 499         *self
 500     }
 501 }
 502
 503 /// Call this function with the character that is being removed,
 504 /// and if it returns a character, replace it with that.
 505 impl<F: Fn(char) -> Option<char>> Replace for F {
 506     fn replace(&self, c: char) -> Option<char> {
 507         self(c)
 508     }
 509 }
 510
 511 fn is_most_fs_safe_char(c: char) -> bool {
 512     c != '/' && c != '\0'
 513 }
 514
 515 fn is_url_safe_char(c: char) -> bool {
 516     // Safe characters are those in the *URL code point* set, minus &, / and ?.
 517     //
 518     // Definitions from the URL and Infra Standards:
 519     //
 520     // > The *URL code points* are ASCII alphanumeric, U+0021 (!), U+0024 ($), U+0026 (&),
 521     // > U+0027 ('), U+0028 LEFT PARENTHESIS, U+0029 RIGHT PARENTHESIS, U+002A (*), U+002B (+),
 522     // > U+002C (,), U+002D (-), U+002E (.), U+002F (/), U+003A (:), U+003B (;), U+003D (=),
 523     // > U+003F (?), U+0040 (@), U+005F (_), U+007E (~), and code points in the range U+00A0 to
 524     // > U+10FFFD, inclusive, excluding surrogates and noncharacters.
 525     //
 526     // > A *noncharacter* is a code point that is in the range U+FDD0 to U+FDEF, inclusive, or
 527     // > U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF,
 528     // > U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
 529     // > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF,
 530     // > U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
 531     //
 532     // Surrogates are already excluded by the `char` data type.
 533
 534     matches!(c,
 535         'A'..='Z' | 'a'..='z' | '0'..='9' |
 536         '!' | '$' | /* '&' deliberately excluded */ '\'' | '(' | ')' | '*' | '+' | ',' | '-' |
 537         '.' | /* '/' deliberately excluded */ ':' | ';' | '=' | /* '?' deliberately excluded */
 538         '@' | '_' | '~' | '\u{a0}'..='\u{fdcf}' | '\u{fdf0}'..='\u{10fffd}')
 539
 540     // Exclude the remaining noncharacters U+??FFFE and U+??FFFF:
 541     && (c as u32) & 0xfffe != 0xfffe
 542 }
 543
 544 fn is_windows_safe_char(char: char) -> bool {
 545     !matches!(char,
 546         '<' | '>' | ':' | '"' | '/' | '\\' | '|' | '?' | '*' |
 547         '\u{0}'..='\u{1f}' | '\u{7f}')
 548 }
 549
 550 fn is_space_or_full_stop(c: char) -> bool {
 551     matches!(c, ' ' | '.')
 552 }
 553
 554 fn is_more_punctuation_character(c: char) -> bool {
 555     matches!(c, '_' | '-' | ',' | ';')
 556 }
 557
 558 fn is_reordering_character(c: char) -> bool {
 559     matches!(c, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}')
 560 }
 561
 562 fn is_reserved_windows_file_name(name: &str) -> bool {
 563     matches!(name.as_bytes(),
 564         | [b'C' | b'c', b'O' | b'o', b'N' | b'n']
 565         | [b'P' | b'p', b'R' | b'r', b'N' | b'n']
 566         | [b'A' | b'a', b'U' | b'u', b'X' | b'x']
 567         | [b'N' | b'n', b'U' | b'u', b'L' | b'l']
 568         | [b'C' | b'c', b'O' | b'o', b'M' | b'm', b'1'..=b'9']
 569         | [b'L' | b'l', b'P' | b'p', b'T' | b't', b'1'..=b'9'])
 570 }
 571
 572 /// Split a name on its final '.', returning (base name, extension) if there is one.
 573 /// Both could be empty.
 574 fn split_extension(input: &str) -> Option<(&str, &str)> {
 575     input
 576         .as_bytes()
 577         .iter()
 578         .enumerate()
 579         .rev()
 580         .find(|(_, c)| **c == b'.')
 581         .map(|(dot_index, _)| (&input[..dot_index], &input[dot_index + 1..]))
 582 }
 583
 584 /// Sanitise a file name with the default options.
 585 /// See [`Options`] for a description of what all the options do.
 586 ///
 587 /// The return value should be suitable as a file name, and will not be empty (if it *would* be
 588 /// empty, it’ll be `_` instead, per [`Options::six_measures_of_barley`]’s default).
 589 #[cfg(feature = "alloc")]
 590 #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
 591 pub fn sanitise(s: &str) -> String {
 592     sanitise_with_options(s, &Options::DEFAULT)
 593 }
 594
 595 /// Calculate a sufficient allocation size for the string used. This number will never exceed
 596 /// `input.len() + 1 + options.reserve_extra`, and will be less on ridiculously long inputs.
 597 ///
 598 /// Only intended for use by crazy allocation-counters like me.
 599 pub fn sufficient_alloc_size<R: Replace>(input: &str, options: &Options<R>) -> usize {
 600     if options.length_limit < 10 {
 601         return 0;
 602     }
 603     if options.extension_cleverness {
 604         if let Some((base_name, extension)) = split_extension(input) {
 605             let extension_length_limit = options.length_limit - 6;
 606             let might_add_underscore = |n| {
 607                 if (n == 3 || n == 4) && options.windows_safe {
 608                     n + '_'.len_utf8()
 609                 } else {
 610                     n
 611                 }
 612             };
 613             return (
 614                 might_add_underscore(base_name.len()).min(options.length_limit) +
 615                 '.'.len_utf8() +
 616                 extension.len().min(extension_length_limit)
 617                 // No reserve_extra on this side because this is the size needed *while working*,
 618                 // but reserve_extra is only needed when we’re done.
 619             ).max(
 620                 might_add_underscore(input.len())
 621                 .min(options.length_limit)
 622                 .max(options.six_measures_of_barley.len()) +
 623                 options.reserve_extra
 624             )
 625         }
 626     }
 627     (input.len().min(options.length_limit) + if options.windows_safe { '_'.len_utf8() } else { 0 })
 628     .max(options.six_measures_of_barley.len()) + options.reserve_extra
 629 }
 630
 631 // Alas, <usize as Ord>::max isn’t const.
 632 const fn max(a: usize, b: usize) -> usize { if a > b { a } else { b } }
 633
 634 macro_rules! max_alloc_size_body {
 635     ($options:ident) => {{
 636         if $options.length_limit < 10 {
 637             return 0;
 638         }
 639         let baseline = max($options.length_limit, $options.six_measures_of_barley.len())
 640             + $options.reserve_extra;
 641         if $options.extension_cleverness {
 642             let extension_length_limit = $options.length_limit - 6;
 643             max($options.length_limit + '.'.len_utf8() + extension_length_limit, baseline)
 644         } else {
 645             baseline
 646         }
 647     }}
 648 }
 649
 650 #[cfg(not(feature = "const-fn-trait-bound"))]
 651 /// Calculate the maximum allocation size required for a given set of options, to correctly handle
 652 /// any input.
 653 ///
 654 /// This is intended for the scratch buffer approach, where you keep one string around and keep on
 655 /// sanitising a whole bunch of inputs into it in turn, or for array-allocated strings like with
 656 /// `tinyvec_string`.
 657 ///
 658 /// This is unfortunately not currently a const fn. If you need a const fn (e.g. to craft an
 659 /// precisely-sized `ArrayString`), you may:
 660 ///
 661 /// 1. Enable the `const-fn-trait-bound` feature on this crate (requires nightly rustc), which will
 662 ///    change this function to be const, or
 663 ///
 664 /// 2. Use [`max_alloc_size_const`] instead, which requires `R = Option<char>`.
 665 ///    (There’s also a `tinyvec_string` usage demonstration there.)
 666 pub fn max_alloc_size<R: Replace>(options: &Options<R>) -> usize {
 667     max_alloc_size_body!(options)
 668 }
 669
 670 #[cfg(feature = "const-fn-trait-bound")]
 671 /// Calculate the maximum allocation size required for a given set of options, to correctly handle
 672 /// any input.
 673 ///
 674 /// This is intended for the scratch buffer approach, where you keep one string around and keep on
 675 /// sanitising a whole bunch of inputs into it in turn, or for array-allocated strings like with
 676 /// `tinyvec_string`.
 677 ///
 678 /// This is a const fn because this crate was compiled with the `const-fn-trait-bound` feature
 679 /// enabled (which requires nightly rustc at the time of writing).
 680 ///
 681 /// See also [`max_alloc_size_const`] for an example of using this with `tinyvec_string`.
 682 pub const fn max_alloc_size<R: Replace>(options: &Options<R>) -> usize {
 683     max_alloc_size_body!(options)
 684 }
 685
 686 /// A `const` variant of [`max_alloc_size`].
 687 ///
 688 /// Sample usage, combined with `tinyvec_string` (with its `rustc_1_55` feature enabled):
 689 ///
 690 /// ```rust,ignore
 691 /// use tinyvec_string::ArrayString;
 692 /// let mut string =
 693 ///     ArrayString::<[u8; max_alloc_size_const(&Options::DEFAULT)]>::new();
 694 /// sanitise_to("input name", &Options::DEFAULT, &mut string);
 695 /// ```
 696 ///
 697 /// Once `const-fn-trait-bound` is stabilised, this method will be deprecated.
 698 #[cfg_attr(feature = "const-fn-trait-bound", doc = "\n \
 699     Since you compiled this crate with the `const-fn-trait-bound` feature, you don’t need this
 700     method. Be cheerful and use `max_alloc_size` instead!")]
 701 pub const fn max_alloc_size_const(options: &Options<Option<char>>) -> usize {
 702     max_alloc_size_body!(options)
 703 }
 704
 705 /// Sanitise a file name. See [`Options`] for a description of what all the options do.
 706 ///
 707 /// The return value should be suitable as a file name for the specified options,
 708 /// unless it’s empty which can only happen if the option `six_measures_of_barley` is empty (or if
 709 /// the `length_limit` option is illegally small, actually).
 710 #[cfg(feature = "alloc")]
 711 #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
 712 pub fn sanitise_with_options<R: Replace>(input: &str, options: &Options<R>) -> String {
 713     let mut out = String::with_capacity(sufficient_alloc_size(input, options));
 714
 715     #[cfg(test)]
 716     let initial_capacity = out.capacity();
 717
 718     sanitise_to(input, options, &mut out);
 719
 720     #[cfg(test)]
 721     if initial_capacity != out.capacity() {
 722         // I’m serious about this making exactly one allocation. No reallocating allowed.
 723         panic!("Capacity changed from {initial_capacity} to {} (on {:?} → {:?})",
 724             out.capacity(), input, out);
 725     }
 726
 727     out
 728 }
 729
 730 /// A target for sanitisation: essentially the subset of `String` functionality used.
 731 ///
 732 /// It might have been nice to use something like `Read + Write + Seek` instead, but the need to
 733 /// delete things after writing means that you need still more, and in the end it’s much easier to
 734 /// treat it as a string.
 735 ///
 736 /// I’ve provided implementations for `String` (if the *alloc* feature is enabled, which it is by
 737 /// default) and `tinyvec_string::ArrayString` (if the *tinyvec_string* feature is enabled),
 738 /// but there’s nothing preventing you from implementing it on other similar string types.
 739 pub trait Stringy:
 740     Index<Range<usize>, Output = str> +
 741     Index<RangeFrom<usize>, Output = str> +
 742     Deref<Target = str> +
 743     Extend<char>
 744 {
 745     fn push(&mut self, ch: char);
 746     fn push_str(&mut self, string: &str);
 747     fn pop(&mut self) -> Option<char>;
 748     fn truncate(&mut self, new_len: usize);
 749     fn replace_range<R>(&mut self, range: R, replace_with: &str) where R: RangeBounds<usize>;
 750 }
 751
 752 #[cfg(feature = "alloc")]
 753 #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
 754 impl Stringy for String {
 755     #[inline] fn push(&mut self, ch: char) { self.push(ch) }
 756     #[inline] fn push_str(&mut self, string: &str) { self.push_str(string) }
 757     #[inline] fn pop(&mut self) -> Option<char> { self.pop() }
 758     #[inline] fn truncate(&mut self, new_len: usize) { self.truncate(new_len) }
 759     #[inline] fn replace_range<R>(&mut self, range: R, replace_with: &str)
 760     where R: RangeBounds<usize>
 761     { self.replace_range(range, replace_with) }
 762 }
 763
// `Stringy` for the fixed-capacity `tinyvec_string::ArrayString`, available with the
// *tinyvec_string* feature. Each method forwards to the same-named method on `ArrayString`.
#[cfg(feature = "tinyvec_string")]
#[cfg_attr(docsrs, doc(cfg(feature = "tinyvec_string")))]
impl<A: tinyvec_string::bytearray::ByteArray> Stringy for tinyvec_string::ArrayString<A> {
    #[inline] fn push(&mut self, ch: char) { self.push(ch) }
    #[inline] fn push_str(&mut self, string: &str) { self.push_str(string) }
    #[inline] fn pop(&mut self) -> Option<char> { self.pop() }
    #[inline] fn truncate(&mut self, new_len: usize) { self.truncate(new_len) }
    #[inline] fn replace_range<R>(&mut self, range: R, replace_with: &str)
    where R: RangeBounds<usize>
    { self.replace_range(range, replace_with) }
}
 775
/// Sanitise a file name, appending the result to an existing string (a `String` or any other
/// [`Stringy`]). Intended for power users only.
///
/// When you use [`sanitise`] or [`sanitise_with_options`], the perfect allocation is artisanally
/// crafted (or something). If you use this carelessly, you may actually cause *more* allocations
/// to be made, rather than fewer, or panic if `S` is a non-growable type (e.g.
/// `tinyvec_string::ArrayString`). You may therefore wish to use [`sufficient_alloc_size`] or
/// [`max_alloc_size`] or [`max_alloc_size_const`] in some cases to calculate how much more to
/// reserve ahead of time.
///
/// See [`Options`] for a description of what all the options do.
///
/// Whatever `out` already contains is left untouched; the sanitised name is appended after it.
/// If `options.length_limit` is less than 10, nothing is appended at all.
///
/// After calling this, `out` will be the same length or longer, never shorter. If you want to know
/// *how much* longer, store and compare the length yourself.
pub fn sanitise_to<R: Replace, S: Stringy>(input: &str, options: &Options<R>, out: &mut S) {
    // Everything before this byte offset was in `out` on entry and must never be modified.
    let protected = out.len();
    // I said in the docs don’t set it to less than 10, but without this zero leads to some
    // unreachable!() being reached, which is æsthetically displeasing, so I’m just going to return
    // empty strings for unreasonably small length limits. 🙂
    if options.length_limit < 10 {
        return;
    }

    // When label-break-value stabilises I’ll switch to that, but until then, loop it is.
    #[allow(clippy::never_loop)]
    loop {  // breaks after exactly one iteration.
        if options.extension_cleverness {
            if let Some((base_name, extension)) = split_extension(input) {
                // With extension-awareness, when the path exceeds length_limit, we prefer
                // to truncate from the base name rather than from the extension. But we don’t
                // know how much we’ll need to truncate until we’ve finished processing the
                // extension, so we have to allocate a maximum of roughly twice as much as
                // we’ll end up needing.
                //
                // For implementation convenience in this corner case, we’ve declared a maximum
                // extension length of six less than length_limit (explained on
                // Options::extension_cleverness).
                let extension_length_limit = options.length_limit - 6;
                sanitise_part(base_name, options, options.length_limit, false, out, protected);
                // Byte length of the sanitised base name, not counting the dot pushed next.
                let base_len = out.len() - protected;
                out.push('.');
                let extension_truncated = sanitise_part(
                    extension,
                    options,
                    extension_length_limit,
                    true,
                    out,
                    // It’s OK for trimming to take out the entire file name (so we’re deliberately
                    // not including base_len in this), but we mustn’t touch what’s not ours.
                    protected,
                );

                // Bytes appended so far: base name, dot and extension together.
                let mut total_len = out.len() - protected;

                if total_len > options.length_limit {
                    if extension_truncated {
                        // Extension is unsalvageable: truncate from the end.
                        while total_len > options.length_limit {
                            match out.pop() {
                                Some(c) => total_len -= c.len_utf8(),
                                None => unreachable!(),
                            }
                        }
                        // Length is now acceptable, but that could have left us with
                        // undesirable trailing characters, so run trim again.
                        out.truncate(protected + trim_end(&out[protected..], options, true).len());
                    } else {
                        // (Sigh. Whose brilliant idea was it to try to preserve extensions anyway?
                        // Maybe if I’d realised the pain it’d cause I wouldn’t have bothered.
                        // It’s not like anyone *else* does it. But it is good, say I. And now I’m
                        // even mulling over grapheme-cluster-aware truncation. Am I mad?)
                        //
                        // The extension fits, so shorten the base name instead: drop whole
                        // characters from its end until the total is within the limit, trim what
                        // remains, then cut the dropped span out from in front of the dot.
                        let base_name_end_index = base_len;
                        let mut base_chars = out[protected..protected + base_name_end_index].chars();
                        while total_len > options.length_limit {
                            match base_chars.next_back() {
                                Some(c) => {
                                    total_len -= c.len_utf8();
                                }
                                None => unreachable!(),
                            }
                        }
                        let base_name = trim_end(
                            &out[protected..protected + base_chars.as_str().len()],
                            options,
                            false,
                        );
                        let range = protected + base_name.len()..protected + base_name_end_index;
                        out.replace_range(range, "");
                    }
                }

                break;
            }
        }
        // Extension cleverness disabled, or no extension found: the much simpler path!
        sanitise_part(input, options, options.length_limit, false, out, protected);
        break;
    }

    // Finally one last bit of processing: checking names that are all dots
    // (though normally windows_safe will already have truncated them to zero).
    if (options.url_safe && (&out[protected..] == "." || &out[protected..] == "..")) ||
        (options.most_fs_safe && out[protected..].chars().all(|c| c == '.'))
    {
        out.truncate(protected);
    }

    // Nothing survived sanitisation: fall back to the configured placeholder (which may itself
    // be empty).
    if out[protected..].is_empty() {
        out.push_str(options.six_measures_of_barley);
    }
}
 886
/// Sanitise one part of a name (the whole name, a base name, or an extension) and append it to
/// `out`.
///
/// - `input`: the raw text of this part.
/// - `length_limit`: the maximum number of bytes this call may append; characters are taken
///   whole, so a character that would cross the limit is not written.
/// - `is_extension`: whether this part is an extension. Extensions get no reserved-Windows-name
///   underscore, and the flag is passed through to [`trim_end`].
/// - `protected`: byte offset in `out` below which nothing may be modified.
///
/// Returns `true` if the part had to be cut short because of `length_limit`.
fn sanitise_part<R: Replace, S: Stringy>(
    input: &str,
    options: &Options<R>,
    length_limit: usize,
    is_extension: bool,
    out: &mut S,
    protected: usize,
) -> bool {
    // Bytes accepted by the take_while stage below.
    let mut len = 0;
    let mut did_truncate = false;

    // State carried between characters by the filter_map stage.
    let mut last_was_remove = false;
    let mut last_was_whitespace = false;
    out.extend(input.chars()
        // Stage 1: normalise whitespace (if enabled) and decide whether each character is to be
        // removed under the enabled options.
        .map(|mut c| {
            c = if options.normalise_whitespace && c.is_whitespace() { ' ' } else { c };
            (c,
                (options.most_fs_safe && !is_most_fs_safe_char(c)) ||
                (options.windows_safe && !is_windows_safe_char(c)) ||
                (options.url_safe && !is_url_safe_char(c)) ||
                (options.remove_control_characters && c.is_control()) ||
                (options.remove_reordering_characters && is_reordering_character(c))
            )
        })
        // Stage 2: substitute removed characters via replace_with (dropping them if it yields
        // nothing), collapsing runs of removals and, with normalise_whitespace, runs of spaces.
        .filter_map(|(c, remove)| {
            if options.collapse_replacements {
                if remove && last_was_remove {
                    return None;
                }
                last_was_remove = remove;
            }
            if remove { options.replace_with.replace(c) } else { Some(c) }
            .filter(|&c| {
                if options.normalise_whitespace {
                    let is_whitespace = c == ' ';
                    let drop = last_was_whitespace && is_whitespace;
                    last_was_whitespace = is_whitespace;
                    !drop
                } else {
                    true
                }
            })
        })
        // Stage 3: trim from the start.
        .skip_while(|&c| {
            (options.trim_spaces_and_full_stops && is_space_or_full_stop(c)) ||
            (options.trim_more_punctuation && is_more_punctuation_character(c))
        })
        // Stage 4: stop at the first character that would exceed length_limit.
        .take_while(|&c| {
            let new_len = len + c.len_utf8();
            if new_len <= length_limit {
                len = new_len;
                true
            } else {
                did_truncate = true;
                false
            }
        })
    );

    if len > 0 {
        // We’ve added something non-trimmed, that’ll guard the potential reserved name underscore.
        out.truncate(protected + trim_end(&out[protected..], options, is_extension).len());
    }

    if !is_extension && options.windows_safe && is_reserved_windows_file_name(&out[protected..]) {
        // This underscore looks to be in danger of being end-trimmed,
        // but in practice we’ve ensured that it won’t be
        // (except maybe one case with a lower length_limit than permitted).
        out.push('_');
    }

    // Whew. Finally done. Breathe a sigh of relief.
    did_truncate
}
 961
 962 fn trim_end<'a, R: Replace>(out: &'a str, options: &Options<R>, is_extension: bool) -> &'a str {
 963     let trim_space_or_full_stop = options.trim_spaces_and_full_stops ||
 964         ((is_extension || !options.extension_cleverness) && options.windows_safe);
 965     out.trim_end_matches(|c| {
 966         (trim_space_or_full_stop && is_space_or_full_stop(c)) ||
 967         (options.trim_more_punctuation && is_more_punctuation_character(c))
 968     })
 969 }
 970
 971 // A concession to those poor Americans et al. 😀
 972 #[cfg(feature = "alloc")]
 973 #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
 974 pub use sanitise as sanitize;
 975 #[cfg(feature = "alloc")]
 976 #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))]
 977 pub use sanitise_with_options as sanitize_with_options;
 978 pub use sanitise_to as sanitize_to;
 979
 980 // How did this get to almost a thousand lines by this point? I’m sure I started out with only
 981 // twenty or so. But then I got careful about allocations, and added extension cleverness, and
 982 // added more features, and documented exhaustingly, and oops, a thousand lines, lines that are
 983 // convoluted at times. Well, I succeeded in all my *functional* goals, with better precision,
 984 // theoretically better but untested speed, better behaviour around extensions, single-allocation
 985 // and even *no*-allocation operation; but utterly lost sight of simple and obviously-correct code.
 986 // Was it worth it? Eh, probably.
 987
 988 // --- Tests ---
 989
 990 #[cfg(feature = "alloc")]
 991 #[test]
 992 fn test_length_limit_things() {
 993     // I wrote these tests before I wrote the matrix. I might as well delete them, but I haven’t.
 994
 995     let short = Options {
 996         length_limit: 10,
 997         ..Options::DEFAULT
 998     };
 999
1000     assert_eq!(sanitise_with_options("abcdef.ghij", &short), "abcde.ghij");
1001
1002     // Unsalvageable extension
1003     assert_eq!(sanitise_with_options("abcde.fghij", &short), "abcde.fghi");
1004
1005     // Windows reserved name protection
1006     assert_eq!(sanitise_with_options("AUX.abcdef", &short), "AUX_.abcd");
1007     assert_eq!(sanitise_with_options("AUX.abcdef", &Options { windows_safe: false, ..short }),
1008                "AUX.abcd");
1009     assert_eq!(sanitise_with_options("lpT7.abcdef", &short), "lpT7_.abcd");
1010     assert_eq!(sanitise_with_options("cOm6.abcdef", &Options { windows_safe: false, ..short }),
1011                "cOm6.abcd");
1012
1013     assert_eq!(sanitise("CON"), "CON_");
1014     assert_eq!(sanitise("aux.h"), "aux_.h");
1015     assert_eq!(sanitise("Lpt1.exe"), "Lpt1_.exe");
1016     assert_eq!(sanitise("xyz"), "xyz");
1017     assert_eq!(sanitise(""), "_");
1018     assert_eq!(sanitise("nül"), "nül");
1019     assert_eq!(sanitise("COM1.jpg.png"), "COM1.jpg.png");
1020 }
1021
// Runs every line of tests/blns.txt and tests/misc.txt through a matrix of option sets, writes
// the outputs (and the capacity of each output string) next to the inputs, and then fails if
// `git diff` reports that any of those files differ from what is committed, or if resanitising
// an output unexpectedly changes it again.
#[cfg(feature = "alloc")]
#[test]
fn matrix() {
    // Look, I know I said I didn’t want std, but I *need* it for these tests, y’see?
    #[cfg(not(feature = "std"))]
    extern crate std;
    use std::prelude::rust_2021::*;
    use std::fmt::Write;
    use std::{eprintln, println, format, vec};

    // Sanitise every line of tests/{set_name}.txt with `options`, checking invariants as it goes,
    // and write the .sanitised and .capacity files for this (set, options) pair. The paths
    // written are appended to `paths`; outputs that change when resanitised are recorded in
    // `unsteady_state` as (set, options name, input, first output, second output).
    fn case<
        R: Replace,
        #[cfg(feature = "tinyvec_string")]
        A: tinyvec_string::bytearray::ByteArray,
    >(
        set_name: &'static str,
        options_name: &'static str,
        options: &Options<R>,
        paths: &mut Vec<String>,
        unsteady_state: &mut Vec<(&'static str, &'static str, String, String, String)>,
        // Apparently you can’t do `case::<#[cfg] A>()`, so we have to do this instead.
        #[cfg(feature = "tinyvec_string")]
        _: std::marker::PhantomData<A>,
    ) {
        println!("Sanitising {set_name} with options {options_name}");
        #[cfg(feature = "tinyvec_string")]
        let mut array_string = tinyvec_string::ArrayString::<A>::new();
        let mut sanitised = String::new();
        let mut capacity = String::new();
        let mut scratch = if options_name == "passthrough" {
            // “memory allocation of 18446744073709551615 bytes failed” 😀
            String::new()
        } else {
            String::with_capacity(max_alloc_size(options) + 1)
        };
        let scratch_size = scratch.capacity();
        for input in std::fs::read_to_string(format!("tests/{set_name}.txt")).unwrap().lines() {
            let output = sanitise_with_options(input, options);
            // A couple of sanity checks make sense here.
            if output.len() > options.length_limit {
                panic!(
                    "Input {input} sanitised to {output}, which at {len} is greater than the allowed {max}",
                    len = output.len(),
                    max = options.length_limit,
                );
            }
            // With extension cleverness only the base name is checked, otherwise the whole output.
            if options.windows_safe && is_reserved_windows_file_name(
                options.extension_cleverness.then(|| &*output)
                    .and_then(split_extension)
                    .map(|(base, _)| base)
                    .unwrap_or(&output)
            ) {
                panic!("Input {input} sanitised to {output}, which is a reserved Windows file name");
            }

            // And ensure sanitise_to is working properly also.
            scratch.truncate(0);
            scratch.push('.');  // A trimmable character, and not six_measures_of_barley.
            sanitise_to(input, options, &mut scratch);
            assert_eq!(scratch.chars().next(), Some('.'));
            assert_eq!(scratch[1..], output);
            #[cfg(feature = "tinyvec_string")]
            {
                // A zero-capacity array (the passthrough case) cannot hold anything, so skip it.
                if array_string.capacity() > 0 {
                    array_string.truncate(0);
                    array_string.push(' ');  // A trimmable character, and not six_measures_of_barley.
                    sanitise_to(input, options, &mut array_string);
                    assert_eq!(array_string.chars().next(), Some(' '));
                    assert_eq!(array_string[1..], output);
                }
            }

            sanitised.push_str(&output);
            sanitised.push('\n');
            let _ = writeln!(capacity, "{}", output.capacity());
            if input != output {
                if options_name == "passthrough" {
                    // Passthrough options are supposed to change nothing, so any difference at
                    // all is reported.
                    unsteady_state.push(
                        (set_name, options_name, input.to_owned(), output.clone(), output),
                    );
                } else {
                    let repeated = sanitise_with_options(&output, options);
                    if repeated != output {
                        sanitised.push_str("⚠ Sanitisation did not reach a steady state. Next line shows the effect of resanitising the line above. ⚠\n");
                        sanitised.push_str(&repeated);
                        sanitised.push('\n');
                        unsteady_state.push(
                            (set_name, options_name, input.to_owned(), output, repeated),
                        );
                    }
                }
            }
        }
        let sanitised_name = format!("tests/{set_name}.{options_name}.sanitised");
        let capacity_name = format!("tests/{set_name}.{options_name}.capacity");
        std::fs::write(&sanitised_name, sanitised).unwrap();
        std::fs::write(&capacity_name, capacity).unwrap();
        paths.push(sanitised_name);
        paths.push(capacity_name);
        if options_name != "passthrough" {
            assert_eq!(scratch_size, scratch.capacity(), "scratch buffer reallocated");
        }
    }

    let mut unsteady_state = vec![];
    let mut paths = vec![];
    let d = Options::DEFAULT;
    for name in ["blns", "misc"] {
        macro_rules! case {
            // On $array_size: I tried using roughly { max_alloc_size_const(&options) + 1 },
            // but threading it all through was just too painful, especially in the absence of
            // const-fn-trait-bound. So I’ll just do one separate test for that.
            ($array_size:literal, $options_name:expr, $options:expr) => {
                let options = &$options;
                // +1 for the ' ' we prefix.
                let required_size = max_alloc_size(options).saturating_add(1);
                assert!($array_size == 0 || required_size <= $array_size,
                    "Test case design error: array being given {} bytes, but {} are needed",
                    $array_size,
                    required_size
                );
                case(
                    name,
                    $options_name,
                    options,
                    &mut paths,
                    &mut unsteady_state,
                    // TODO: after https://github.com/ThatsNoMoon/tinyvec_string/issues/3 is
                    // resolved, ditch $array_size and use max_alloc_size instead.
                    #[cfg(feature = "tinyvec_string")]
                    std::marker::PhantomData::<[u8; $array_size]>,
                );
            }
        }
        // Assumption that I decline to “test” because it’d be silly:
        // sanitise(…) == sanitise_with_options(…, &Options::DEFAULT).
        case!(512, "default", d);
        case!(512, "realistic-length_limit-reduction", Options { length_limit: Options::DEFAULT.length_limit - 4, ..d });
        case!(512, "url_safe", Options { url_safe: true, ..d });
        case!(512, "silly-replace_with", Options::DEFAULT.with_replace_with(|c| char::from_u32(c as u32 + 1)));
        case!(512, "no-windows_safe", Options { windows_safe: false, ..d });
        case!(256, "no-extension_cleverness", Options { extension_cleverness: false, ..d });
        // 10 + 1
        case!(11, "short-sans-extension_cleverness", Options { length_limit: 10, extension_cleverness: false, ..d });
        // 15 + 1
        case!(16, "short", Options { length_limit: 10, ..d });
        case!(0, "passthrough", Options {
            length_limit: usize::MAX,
            reserve_extra: 0,
            extension_cleverness: false,
            most_fs_safe: false,
            windows_safe: false,
            url_safe: false,
            normalise_whitespace: false,
            trim_spaces_and_full_stops: false,
            trim_more_punctuation: false,
            remove_control_characters: false,
            remove_reordering_characters: false,
            replace_with: None,
            collapse_replacements: false,
            six_measures_of_barley: "",
        });
        // Run a case with all of the listed boolean options off except the one named.
        macro_rules! case_only {
            ($option:ident) => {{
                let mut options = Options {
                    most_fs_safe: false,
                    windows_safe: false,
                    url_safe: false,
                    normalise_whitespace: false,
                    trim_spaces_and_full_stops: false,
                    trim_more_punctuation: false,
                    remove_control_characters: false,
                    remove_reordering_characters: false,
                    ..d
                };
                options.$option = true;
                case!(512, concat!("just-", stringify!($option)), options);
            }}
        }
        case_only!(most_fs_safe);
        case_only!(windows_safe);
        case_only!(url_safe);
        case_only!(normalise_whitespace);
        case_only!(trim_spaces_and_full_stops);
        case_only!(trim_more_punctuation);
        case_only!(remove_control_characters);
        case_only!(remove_reordering_characters);
        // Eh, I’m bored now. That’ll do.
    }

    // Triage the recorded unsteady states: known or expected ones are ignored, anything else is
    // printed and makes the test fail at the end.
    let mut complain_of_unsteady_states = false;
    if !unsteady_state.is_empty() {
        for (set, options, original, first, second) in &unsteady_state {
            match (*set, *options, &**original, &**first, &**second) {
                ("blns", "short", "Dr. Herman I. Libshitz", "Dr. Herman", "Dr.Herm") |
                ("blns", "short", r#"{{ "".__class__.__mro__[2].__subclasses__()[40]("/etc/passwd").read() }}"#, "{{ __.__cl", "{{.cl") => {
                    // Skip known cases of unsalvageable extensions combining with dots in the base
                    // name to effectively give a new extension, making quiescence take two steps.
                    // Making these steady-state would take too much effort, and the harm is
                    // minimal (the unsteady state is still a correctly sanitised name).
                },
                (_, "silly-replace_with", _, _, _) => {
                    // Certainly this one isn’t steady-state!
                },
                (_, "passthrough", _, _, _) => {
                    complain_of_unsteady_states = true;
                    eprintln!("Unsteady state in {set} with {options} options, diff tests/{set}.{options}.txt and tests/{set}.{options}.sanitised");
                },
                _ => {
                    complain_of_unsteady_states = true;
                    eprintln!("Unknown unsteady state in {set} with {options} options, look for the ⚠ symbol in tests/{set}.{options}.sanitised");
                },
            }
        }
    }

    // The files written above are compared against their committed versions; `git diff
    // --exit-code` exits unsuccessfully if anything differs.
    if !std::process::Command::new("git")
        .arg("diff")
        .arg("--exit-code")
        .arg("--text")
        .args(&paths)
        .status()
        .unwrap()
        .success()
    {
        panic!("sanitisation produced different results than are known, review the diffs");
    }

    // A guard against committing an unsteady state.
    if complain_of_unsteady_states {
        panic!("Some sanitisations unexpectedly failed to reach a steady state.");
    }
}
1255
#[cfg(feature = "tinyvec_string")]
#[test]
#[should_panic]
fn test_tinyvec_string_panic() {
    // The ordinary cases are exercised elsewhere, including absurdly long inputs landing safely
    // in moderately sized arrays under moderate length limits. What none of that demonstrates
    // is the panic you get when the backing array is simply too short, so this test does. 🙂
    //
    // The input is 15 bytes long; the array only has room for 12.
    let mut too_small = tinyvec_string::ArrayString::<[u8; 12]>::new();
    sanitise_to("Watch me panic!", &Options::DEFAULT, &mut too_small);
}
1269
#[cfg(feature = "tinyvec_string")]
#[test]
fn test_tinyvec_string_max_alloc_size() {
    use tinyvec_string::ArrayString;

    // What `max_alloc_size_const` reports for the default options *right now*. This exact figure
    // is explicitly not covered by the compatibility contract: smarter extension handling or
    // grapheme cluster changes could push it up to 510.
    const CURRENT_SIZE: usize = 505;

    // Deliberately far longer than any sane file name, with a late full stop so that a long
    // "extension" is in play too.
    let input =
        "Watch everything being hunky dory, even when I throw unreasonably long values at it all. \
        Even when dots become extension separators; yes, even then. Then further: into the deep \
        reaches of testing, where things start to get garbled, and having written at least 255 \
        characters, I have to now write just as much of extension—horror. But that was a full \
        stop so that this could now be the extension, and I can’t put a dot for the next while› \
        ¿Whatever will I do? I suspect things are getting out of hand here, but I can’t stop now; \
        I’m running out of things to write, but it should be enough by now!";

    // The sanitised form: truncated, and with the colon after “further” replaced by an underscore.
    let expected = "Watch everything being hunky dory, even when I throw unreasonably long \
        values at it all. Even when dots become extension separators; yes, even then. Then \
        further_ into the deep reaches of testing, where things start to get garbled, and having \
        written at l";

    // The explicit annotation pins the const-computed array length to `CURRENT_SIZE` at compile
    // time; the capacity assertion then checks the same figure at run time.
    let mut buffer: ArrayString<[u8; CURRENT_SIZE]> =
        ArrayString::<[u8; max_alloc_size_const(&Options::DEFAULT)]>::new();
    assert_eq!(buffer.capacity(), CURRENT_SIZE);

    // A buffer of the advertised size must take even this input without overflowing.
    sanitise_to(input, &Options::DEFAULT, &mut buffer);
    assert_eq!(buffer, expected);
}