From 0000000d63631779a143778dc09275292ef8ed75 Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Wed, 13 Jul 2022 02:17:51 +1000 Subject: [PATCH 1/1] TESID 1.0.0, with Rust, Python, JS implementations Yes, I squashed all of the early history away. --- .gitignore | 8 + CHANGELOG | 12 + Makefile | 39 ++ README.md | 70 +++ algorithms.txt | 118 +++++ design-rationale.txt | 460 ++++++++++++++++ javascript/CHANGELOG.md | 7 + javascript/COPYRIGHT | 15 + javascript/README.md | 66 +++ javascript/package.json | 22 + javascript/tesid.d.ts | 196 +++++++ javascript/tesid.js | 178 +++++++ javascript/tests.js | 163 ++++++ javascript/typescript-tests.ts | 108 ++++ python/CHANGELOG.rst | 7 + python/COPYRIGHT | 15 + python/README.rst | 62 +++ python/pyproject.toml | 3 + python/setup.cfg | 28 + python/src/tesid/__init__.py | 315 +++++++++++ python/src/tesid/base32.py | 37 ++ python/src/tesid/fpeck.py | 53 ++ python/src/tesid/test/__init__.py | 14 + python/src/tesid/test/base32.py | 44 ++ python/src/tesid/test/fpeck.py | 16 + python/src/tesid/test/tesid.py | 118 +++++ rust/CHANGELOG.md | 7 + rust/COPYRIGHT | 17 + rust/Cargo.toml | 19 + rust/README.md | 73 +++ rust/examples/rejection_rates.rs | 98 ++++ rust/src/base32.rs | 169 ++++++ rust/src/fpeck.rs | 122 +++++ rust/src/inline_string.rs | 140 +++++ rust/src/lib.rs | 852 ++++++++++++++++++++++++++++++ rust/test | 14 + 36 files changed, 3685 insertions(+) create mode 100644 .gitignore create mode 100644 CHANGELOG create mode 100644 Makefile create mode 100644 README.md create mode 100644 algorithms.txt create mode 100644 design-rationale.txt create mode 100644 javascript/CHANGELOG.md create mode 100644 javascript/COPYRIGHT create mode 100644 javascript/README.md create mode 100644 javascript/package.json create mode 100644 javascript/tesid.d.ts create mode 100644 javascript/tesid.js create mode 100644 javascript/tests.js create mode 100644 javascript/typescript-tests.ts create mode 100644 python/CHANGELOG.rst create mode 100644 
python/COPYRIGHT create mode 100644 python/README.rst create mode 100644 python/pyproject.toml create mode 100644 python/setup.cfg create mode 100644 python/src/tesid/__init__.py create mode 100644 python/src/tesid/base32.py create mode 100644 python/src/tesid/fpeck.py create mode 100644 python/src/tesid/test/__init__.py create mode 100644 python/src/tesid/test/base32.py create mode 100644 python/src/tesid/test/fpeck.py create mode 100644 python/src/tesid/test/tesid.py create mode 100644 rust/CHANGELOG.md create mode 100644 rust/COPYRIGHT create mode 100644 rust/Cargo.toml create mode 100644 rust/README.md create mode 100644 rust/examples/rejection_rates.rs create mode 100644 rust/src/base32.rs create mode 100644 rust/src/fpeck.rs create mode 100644 rust/src/inline_string.rs create mode 100644 rust/src/lib.rs create mode 100755 rust/test diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f69119c --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +/javascript/tesid-*.tgz +/javascript/tesid.min.js +/javascript/typescript-tests.js +/python/dist/ +/python/src/tesid.egg-info/ +/rust/Cargo.lock +/rust/target +__pycache__ diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 00000000..9275e94 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,12 @@ +Changelog +========= + +This changelog is for TESID itself, as distinct from any *implementation* of TESID. + +1.0.0 (2022-07-14) +------------------ + +Initial release. No further releases are anticipated, +especially because almost any change will break compatibility, +and any upgrade path would be very likely to be messy. +But I choose to number the TESID scheme for robustness in case I do make a second version under the same name. 
diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..7a1dc3d --- /dev/null +++ b/Makefile @@ -0,0 +1,39 @@ +.PHONY: help build test build-javascript build-python test-javascript test-rust test-python javascript-sizes + +help: + @echo "No default thing to make, please choose something (e.g. test)." + +build: build-javascript build-python + +test: test-javascript test-rust test-python + + +# Requires npm to be installed locally. +build-javascript: + cd javascript && \ + rm -f tesid-*.tgz tesid.min.js typescript-tests.js && \ + npm pack + +# Requires python and python-build to be installed locally. +build-python: + cd python && python -m build + + +# Requires npm and tsc to be installed locally. +test-javascript: + cd javascript && npm run test + +# Requires rustup, a nightly toolchain, clippy on the default toolchain, and cargo-msrv. +test-rust: + cd rust && ./test + +# Requires mypy to be installed locally. +test-python: + cd python/src && python -m unittest && mypy . + +# Requires npm and zopfli to be installed locally. +javascript-sizes: + @# See https://github.com/terser/terser/issues/496 about wrap_func_args=false; disabling a Terser misfeature to save two bytes. + npx terser --module --compress --mangle --format wrap_func_args=false --output javascript/tesid.min.js -- javascript/tesid.js + @echo "min = $$(wc -c . + +TESID is an algorithm that provides a solid way for you to store sequential identifiers in your database, +but expose cryptographically-secure pseudorandom but fairly short strings to users. +This is ideal for most situations where you have centralised ID allocation and don’t *actively want* sequential IDs, +stopping the often-important information leak of the ID sequence, +while still keeping IDs quite short and simple, +so that humans can still usefully interact with them. +In these circumstances, it’s most obviously a good alternative to UUIDs which are massive overkill and fairly inconvenient. 
+ +What it does +------------ + +There are three steps to TESID: + +- First, if desired, it does type discrimination and sparsification, + by multiplying by a sparsity factor and adding a discriminant. + (This allows you to avoid TESID reuse or confusion across types.) + +- Secondly, it scrambles the number with a real cryptographic block cipher within certain ranges: + a 20-bit block size for values 0 to 2²⁰ − 1, 30-bit for values 2²⁰ to 2³⁰ − 1, + &c. through 40-bit, 50-bit, 60-bit, 70-bit, 80-bit, 90-bit and 100-bit. + +- Thirdly, it converts the scrambled number to a string using base conversion with a base-32 alphabet, + with leading padding as necessary to the appropriate length for its range: + 4 characters for 20-bit values, 6 for 30-, 8 for 40-, &c. until 20 for 100-bit values. + +The end result is that you get nice short IDs for as long as is possible, +but avoid exposing the numeric sequence. +(In the absence of sparsity and discrimination, +you’ll get about a million four-character TESIDs, +a billion six-, a trillion eight-, and so on.) + +See for a more detailed description. + +Examples +-------- + +Refer to the examples in the README of each subdirectory. + +Working with keys +----------------- + +TESID uses a 128-bit key for its cryptography; +libraries take this as a 32-character big-endian lowercase hexadecimal string. + +This key should be randomly generated. 
+Here are a few command-line techniques you can use: + +- `openssl rand -hex 16` +- `python -c 'import secrets; print(secrets.token_hex(16))'` +- ` str + byte.toString(16).padStart(2, "0"), ""))'` + +More reading +------------ + +- [More general information](https://chrismorgan.info/tesid/more/) +- [The algorithms](algorithms.txt) +- [Design rationale](design-rationale.txt) diff --git a/algorithms.txt b/algorithms.txt new file mode 100644 index 00000000..d1f9e22 --- /dev/null +++ b/algorithms.txt @@ -0,0 +1,118 @@ +======================= +The algorithms of TESID +======================= + +Here are prose descriptions of the TESID encoding and decoding algorithms, +including the pieces that make them up. + +This is the Table of Ranges, used in encoding and decoding: + +┌─────────────┬────────┬─────────┬─────────────┐ +│ Range │ Length │ 5peck n │ ~№ values │ +┝━━━━━━━━━━━━━┿━━━━━━━━┿━━━━━━━━━┿━━━━━━━━━━━━━┥ +│ [0, 2²⁰) │ 4 │ 10 │ 1 million │ +│ [2²⁰, 2³⁰) │ 6 │ 15 │ 1 billion │ +│ [2³⁰, 2⁴⁰) │ 8 │ 20 │ 1 trillion │ +│ [2⁴⁰, 2⁵⁰) │ 10 │ 25 │ Quadrillion │ +│ [2⁵⁰, 2⁶⁰) │ 12 │ 30 │ ⋮ │ +│ [2⁶⁰, 2⁷⁰) │ 14 │ 35 │ │ +│ [2⁷⁰, 2⁸⁰) │ 16 │ 40 │ │ +│ [2⁸⁰, 2⁹⁰) │ 18 │ 45 │ │ +│ [2⁹⁰, 2¹⁰⁰) │ 20 │ 50 │ │ +└─────────────┴────────┴─────────┴─────────────┘ + +(The *approximate number of values* column is purely informational, +not being used in the algorithm. It’s the size of the range, +showing how many IDs you can expect to encode at each length, +given sparsity=1 and discriminant=0.) + +See also , +which explores some of the reasoning behind TESID. + +5peck +===== + +Encryption is done with 5peck, a slight variant of Speck with different parameter values and a slightly different key expansion technique. + +Speck is defined in the paper +*The SIMON and SPECK Families of Lightweight Block Ciphers* +which can be found at . + +5peck is defined for word sizes n={10, 15, 20, 25, 30, 35, 40, 45, 50}. +In all cases, T=30, α=9, β=2, and m is not applicable. 
+ +Key expansion is shared across all word sizes. +A single 128-bit key is expanded in the usual Speck way, +with parameters n=64, m=2, T=30, α=9, β=2. + +For 5peck encryption and decryption with word size n, +take the n least significant bits of each 64-bit round key. + +Thus, a single key and single expansion can be used for all sizes. + +Where Speck’s convention is to operate on byte sequences, +5peck overtly takes unsigned integers as its blocks, +with the least significant bits forming the first word (x), +and the most significant bits forming the second word (y), +following Speck’s endianness convention. + +Base-32 +======= + +Stringification is done by numeric base conversion with the following alphabet: + + 23456789abcdefghijkmnpqrstuvwxyz + +The decimal 311177248 encodes to “base32”, +as 9×32⁵ + 8×32⁴ + 24×32³ + 12×32² + 1×32¹ + 0×32⁰. + +Because TESID wants strings of certain lengths, +values are padded with leading zeroes (“2” in this alphabet). + +“base32” padded to length nine would be “222base32”. + +TESID encoding +============== + +Takes: + + • an *ID* (non-negative integer), + • an *expanded key*, + • a *sparsity* factor (positive integer, default 1), and + • a *discriminant* (non-negative integer, default 0). + +Method: + + 1. Multiply *ID* by *sparsity*, and add *discriminant*. + This is the *ID to encrypt*. + 2. Select the row in the Table of Ranges where the *ID to encrypt* lies within the row’s *range*. + Take the *length* and *n*. + If no row matches (that is, the ID to encrypt is 2¹⁰⁰ or higher), fail. + 3. Encrypt the ID with the 5peck cipher, + using the *expanded key* and *n*. + 4. Convert the encrypted ID to base-32, + padding to *length*. + This string is the encoded TESID. + +TESID decoding +============== + +Takes: + + • a string *TESID*, + • an *expanded key*, + • a *sparsity* factor (positive integer, default 1), and + • a *discriminant* (non-negative integer, default 0). + +Method: + + 1. 
Select the row in the Table of Ranges where *TESID*’s length matches. + Take the *range* and *n*. + If there is no matching row, fail. + 2. Convert *TESID* from base-32 into an integer. + 3. Decrypt this value with the 5peck cipher, + using the *expanded key* and *n*. + 4. If the decrypted value is not in *range*, fail. + 5. Subtract *discriminant* from the decrypted value, and divide by *sparsity*. + If the subtraction goes negative, or if the division leaves a remainder, fail. + The result is the decoded TESID. diff --git a/design-rationale.txt b/design-rationale.txt new file mode 100644 index 00000000..091a012 --- /dev/null +++ b/design-rationale.txt @@ -0,0 +1,460 @@ +====================== +TESID design rationale +====================== + +TESID is made up of three parts: + + 2. Encryption, + 3. String encoding, and + 1. A convention for reducing occupation of the ID space + and attaching a discriminant for types or other such details. + +(I’ve numbered these items in their encoding order, +but I’m going to discuss them in this altered order. +Things would get boring if list item numbers always had to be in order.) + +See for a basic overview of it all. +This file explains why things are the way they are. + +2. 5peck: Speck, but with five-based word sizes and just one key +================================================================ + +Encryption is done with some new instances of the Speck cipher family, +which I have called 5peck (because it amused me). +5peck is pronounced “fpeck”, not to be confused with “ſpeck”. +(This also amuses me.) +In places where an identifier is required that cannot start with a numeral, I use fpeck. + +There are two particular properties of 5peck ciphers: + + • All instances share an expanded key, + taking as many bits as they require of each round key. 
+ This is a significant deviation from Speck for small word sizes, + since there’s not much point in longer keys in usual lightweight block cipher situations, + due to other weaknesses like birthday attacks. + But such reasons are inapplicable to 5peck. + + • Word sizes are multiples of five, + in order to match the base-32 alphabet that follows (32 is five bits). + This is not normal because it’s more computationally expensive + with no real benefit for normal cryptographic purposes, + but there’s no reason not to do it when you have a reason like I do. + +5peck ciphers don’t get names of their own (unlike Speck32/64, Speck128/128, &c.). +Wouldn’t want them getting self-important or anything. + +In the reference implementation, all 5peck ciphers use a 64-bit machine word, +with appropriate rotation and wrapping addition and subtraction via masking, +but all that is implementation detail. + +For key scheduling: a 128-bit key is expanded with n=64, m=2, T=30, α=9, β=2. +Encryption takes the n least significant bits of each round key. + +5peck is defined for word sizes n={10, 15, 20, 25, 30, 35, 40, 45, 50}. +In all cases, T=30, α=9, β=2, and m is not applicable. + +The notable omission from the sequence is n=5, +which would produce 1024 two-character TESIDs. +I omitted it because it’s not *that* useful, +I’d need to support a different value of at least α, +and I was concerned about its integrity, +given how extremely it deviates from the pattern I see in the Speck paper. +(To reach 128-bit security, I think it’d need at least a few more rounds, +T=30 yielding only 150 bits in its expanded key.) +So instead I start at n=10 and four-character TESIDs. + +Justification for the single expanded key and T=30 +-------------------------------------------------- + +Probably the most significant deviation from Speck is the 128-bit key used all round: +does it or can it lead to 128-bit security, even at n=10? +And how many rounds do we need? 
+ +These are the numbers of rounds for the defined Speck instances: + + • For n=16: T = 18 + m + • For n=24: T = 19 + m + • For n=32: T = 23 + m + • For n=48: T = 26 + m + • For n=64: T = 30 + m + +I’m going to generalise and call this T = w + m, +where w is very approximately proportional to n + c for some constant c. +n=24 is a mild outlier from my predictions: I’d have expected w=20, not w=19. +And assuming that w is *linearly* correlated to n is hubris. + +In the addition rather than multiplication of m, +it is clearly seen that security is not particularly strongly proportional to the number of round key bits; +Speck128/128 has 16 bits of round key per bit of security, +but Speck128/256 has only 8.5 bits of round key per bit of security, +and Speck32/64 has 5.5 bits of round key per bit of security. +You clearly can’t take this too far, as there’s an obvious hard lower bound of 1, +and imperfections in diffusion will mean that the real lower bound is higher. +(Skipping to the end state: +for n=10 I end up with 2.34 bits of round key per claimed bit of security, +which is not a huge amount of margin, but still sufficient, I think. +n=5 would have been 1.17, which I suspect would be insufficient.) + +(I’m treating the number of bits of security as equivalent to the number of bits of key, +because that is the intent of the cipher.) + +Well then, the number of rounds. +Or actually, the key expansion technique, that’s important too. + +Originally, I expanded the key for each value of n, +by consuming one word of the key until it was all gone +(plus however many extra zero bits were needed at the end); +this led to m=ceil(128 / n). +I also tentatively chose T=30, +leading to values of w from 17 (for n=10) to 27 (for n=50). +This seemed fine for a while, +but I wasn’t enthusiastic about having to store 2160 bytes of expanded keys +(as I was using u64 for all n), and n=10’s m=13 was rather a lot. 
+Then I thought of just expanding it once, with n=64, m=2, +and during encryption taking the n bits of each round key. +The purpose of key expansion is to diffuse the key bits into the round keys, +so this shouldn’t reduce security in any way. +And it gets the whole key into the mix sooner, +so it should lead to *increased* security in the round keys for small values of n. +This effectively reduces the very high values of m for small values of n, +so that my w is higher for them. +This suggests that I could probably reduce the number of rounds for smaller n. +A nice simple solution here is T = 20 + n/5, +which stays higher than Speck, but by much less. +The most important one to consider here is n=10: this takes it down to T=22, +meaning 220 bits of round key. +This is heading too low for my liking. +With this approach to key scheduling, +I suspect you may need more rounds to fit this much security in. + +(This is why I eliminated n=5: it’s hard to get such high security out of such a block size with this type of cipher; even n=10 may be pushing it.) + +And really, performance is sufficiently excellent that it’s not worth fussing about reducing rounds by 25%. +My Rust benchmark of encrypting all n=10 values (2²⁰ of them, around a million) +shows it taking under half a nanosecond for each one (under 0.5ms for all) on my laptop, +and this will hold for all block sizes, since all are using u64 machine words with masking. +So I’ll just leave it a flat 30 rounds, +which I’m fairly confident is sufficient for 128-bit security at n=10, +and significant overkill for n=15 to n=40 or so. +I’ve run the numbers in a few different ways and am as confident in the results as I could hope to be as a crypto layman. 
+ +Justification for α = 9, β = 2 +------------------------------ + +Speck’s decision for their rotation values is explained in +: +they started with (7, 2) and it worked well, +but switched to (8, 3), which was still good enough, +for better x86 performance due to SIMD byte-shuffle, +except for Speck32 where (8, 3) was too low-quality on a 16-bit word +and they didn’t want to increase the number of rounds to compensate. +They acknowledge that there are other values like (9, 2) that are better in a vacuum, +but that the attacks they handle better aren’t security-limiting attacks, +and that performance is worse on some platforms. + +In 5peck, word sizes are multiples of 5, +so the SIMD byte-shuffle argument fails, +as do the other performance arguments. +Intuition suggested to me that α − β = 5 might suffer slightly; +some basic cryptanalysis concurred and suggested (9, 2) works well. +(I use that for the n=64, m=2 key expansion too.) +In related news, it *looks* to me from some basic CryptoSMT analysis like multiples of five for n fare better than multiples of eight. +Not sure if I’m barking up the wrong tree or if that’s a real effect. +If you happen to be a cryptographer, I’d quite like to know more, for curiosity’s sake. + +Other remarks +------------- + +I do not know if my key reuse across ciphers weakens anything. +In principle, I don’t believe it should, +especially given that a particular value will only ever be encrypted with one value of n. + +Remember that even if I can figure out how to nudge cryptographers’ tools along, +I am not a professional cryptographer and really don’t know what I’m doing. +But I do think that the cryptography covered here is sound and robust, +to approximately 128 bits of security, +which is frankly pretty extreme overkill for ID scrambling. 
+*Even if* I have gravely miscalculated on n=10, +I don’t think there’s any way that it could reduce it below even 80 bits, +which would still be significant overkill for the applicable threat model. + +I’ve used quite a bit of logic and reasoning here, +which I imagine is a bad sign in cryptographic circles, +where it’s better to use concrete measurements. +I invite analysis by experts whose fancy is tickled. + +If I’ve scared you in all of this, I would like to reassure you: +I’ve been handling the cryptography carefully just out of principle, +and in the real world it doesn’t actually matter, +because in the intended use case of this library, +literally no currently‐known or hypothesised attack will *ever* be viable. +Your IDs are safe. + +3. Base-32 encoding +=================== + +I have chosen this alphabet: + + 23456789abcdefghijkmnpqrstuvwxyz + +That is, ASCII digits and lowercase letters, minus the following characters: + + • 0, as visually confusable with O and audially confusable with o + (since zero is extremely commonly pronounced “oh”, just like o); + + • 1, as visually confusable with l and I; + + • l, as visually confusable with 1 and I; and + + • o, as audially confusable with 0. + +For some purposes it might be nice to remove more: + + • i, as confusable in uppercase with l and 1; + + • 5 and s, as mutually confusable in handwriting and uppercase; + + • 2 and z, as mutually confusable in handwriting and uppercase; and + + • v and u, as mutually confusable in some fonts and handwriting. + +But with bit-level block sizes +(and without messing around with more esoteric format-preserving encryption, which I did try), +a power-of-two base yields the best results for performance, code size and compactness of representation, +and I didn’t want to go down to 16. +So base-32 it is, meaning five bits per character. + +An interesting consequence of the technical choices is that TESIDs are an even number of characters. 
+This means that you could even provide an *optional* check digit, if it took your fancy: +even-length string means no check digit, odd-length means check digit. +If you want a check digit, I suggest the Damm algorithm with an order-32 table. +At this time, I offer no assistance in this matter. + +When thinking of ID lengths: every five bits is one character, +or more practically (because of the block sizes used) every ten bits two characters. +(Speck can only support even block sizes, +so I couldn’t support odd-length strings without compromising my design a bit, +or finding another cipher as good that could support odd block sizes, +which I failed to do, though I did toy with Hasty Pudding for a while.) + +One final obvious question: why base-32, why not a different base? + + • To begin with, only ASCII is considered. Non-ASCII would be troublesome. + + • If convenient, a power of two is good, for its bit alignment, + leading to a consistent pattern of block sizes and lengths, + easier reasoning, and slightly improved efficiency of execution and length. + + • Base-64 is eliminated because it’s not sufficiently human-friendly: + + (a) it requires two non-alphanumeric characters in its alphabet, + which messes up wordwise selection as, of ASCII, + underscore is the only nonalphanumeric character that’s “word”. + + (b) confusable characters is a problem, + and mixed case an inconvenience. + + In its favour, exactly six bits per character *is* nice, + and its evenness means you could fully efficiently support all string lengths, + odd as well as even (with Speck n=3×length). + + • Base-62 (alphanumeric) solves the wordwise selection problem, + but doesn’t solve confusables or mixed case, + and loses the neatness of bit alignment. + + • In early TESID design I used Base58 (alphanumeric minus 0, I, O, l), + and was considering going down to base-57 (removing 1) + which still yielded 6-/11-character strings for the 32- and 64-bit block sizes I used at the time. 
+ But a couple of people convinced me that case-sensitivity was undesirable, + and once I got into custom block sizes, + the difference in base doesn’t make such a difference in length. + + • Base-32 is pretty much as large as you can go while removing all usual confusables. + + • Going down perhaps as far as base-25 to eliminate more diverse confusables + (mentioned above) was tempting, but I like the neatness of 32. + + • Going down to base-16 (whether hexadecimal or something else) + *would* solve all the problems identified, + including even minimising concerns about undesirable patterns in strings, + but I’m not fond of the longer IDs it leads to: + a 40-bit value is 10 characters, + compared to base-32’s 8. + (And with a little sparsity or type discrimination, + a 40-bit value isn’t unreasonable.) + Also there’s something I can’t quite place my finger on where hexadecimal just feels… *programmery*. + The base is too small to feel right in some way I can’t properly express. + A larger base somehow ends up feeling more human. + (Don’t take from this that programmers are not human. + Unless it amuses you to do so, in which case I won’t try to stop you.) + + • Decimal is unsuitable as it tempts numeric treatment, + but leading zeroes are significant due to the cipher range overlaps. + Also it’s just misleading for people, because if they see numbers, + they kinda expect a sequence. + +If designing with a different alphabet, different block sizes would make sense. +If encoding to base-*b* and padding to length *l*, +the largest block size that can be used is ceil(log₂(b**l - b**(l - 1))) +(this formula includes rezeroing the block range, +which is useful for some values of b and l). 
+For l={1, 2, 3, …}, block sizes up to 64 bits: +b=16 yields block sizes 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64; +b=20 yields block sizes 0, 5, 9, 13, 18, 22, 26, 31, 35, 39, 44, 48, 52, 57, 61, 65; +b=25 yields block sizes 0, 5, 10, 14, 19, 24, 28, 33, 38, 42, 47, 52, 56, 61, 65; +b=32 yields block sizes 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65; +b=58 yields block sizes 0, 6, 12, 18, 24, 30, 36, 41, 47, 53, 59, 65; +b=64 yields block sizes 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66. +But with Speck, block sizes must be multiples of two, +so any odd-numbered block sizes must be reduced by one. +(That is, a five-character string in base-32 could only go up to 2²⁴, not 2²⁵.) + +To my mind, the interesting variants of TESID here are: + + • Supporting odd-length strings, + which for base-32 means Speck n={2, 5, 7, 10, 12, 15, 17, 20, 22, …}. + Presuming you still started at four characters (n=10), + I think the values α=9 and β=2 would still be sound. + + • Going down to base-16, with Speck n={2, 4, 6, 8, 10, 12, …, 64}, + officially supporting the full u128 range of up to 32 characters, + though probably still starting at five characters (n=10). + If you wanted the first 65536 IDs to be less than five characters, + I’d suggest officially dropping below 128-bit security. + (Aside: n=2 is fundamentally incompatible with 128-bit security, + as there are too few possible permutations of a block: 2⁴! < 2¹²⁸.) + The fact that base-16 aligns with bytes is quite nice. + In the Rust implementation, since encode_long would no longer be fallible, + I think I’d drop the u64/u128 divide and go u128 throughout. + +Supporting odd-length strings does disclose more epochs, +but it doesn’t matter, especially not when you can tweak all that with⸺ + +1. 
Sparsity and discrimination +============================== + +This is a simple extra step that can be added before encryption, +in order to make valid IDs harder to guess, +and to avoid overlap between IDs of different types. + + id → id × sparsity + discriminant + +There are two purposes for sparsity: + +1. Reducing the occupation of the ID space. + If you don’t do this, then by the time you have a million or so IDs assigned (all the ones up to 2²⁰), + *every* four-character string made up of base-32 characters will correspond to a valid ID. + This can be undesirable, for example if you would like to use IDs as authentication, + or if you just plain don’t want valid IDs to be guessable. + + (“Using IDs as authentication”: that is, + having the ID is sufficient to access and/or mutate an entity. + This means things like magic login links, + and unlisted videos on a site like YouTube.) + + Given a 64-bit ID and 64-bit discriminant, the largest value that can safely be used for sparsity is 2³⁶ − 1. + I suggest using powers of two (assigning a certain number of bits), but you don’t have to. + If you allot 36 bits of sparsity (the most that can safely be done with 64-bit IDs, + as 36 + 64 = 100 which is the largest block size supported by the 5peck ciphers, + though in practice nothing will fill even 50 bits of that 64-bit ID, + so with care you can go higher than 36 if you want), + then only one in every 68 billion possible IDs will be valid, + and it’s very probably reasonable to use IDs as authentication. + With 36 bits of sparseness, you’d get 16 8-character IDs, + then 16384 10-character IDs, then 16 million 12-character IDs, &c. + +2. Avoiding ID reuse across types or tenants, by adding a discriminant. + Without this, a given ID number will produce the same TESID, + which could lead to type confusion, + allow discovery of valid IDs of different types + (or on different tenants, or whatever), + or disclose details about types’ sequences. 
+ With this applied correctly and consistently, + TESIDs will effectively represent the encryption of the (type, id) tuple, + instead of just the ID number. + Because of this, a decoder can also determine which type an ID pertains to. + +If you are going to use TESID for more than one type, +I recommend adding bits for a type discriminant. + +As for how much sparsity to add, decide that yourself. +Every bit you add increases the amortised ID length by 0.2 characters. +(Or, every ten bits you add increases the ID length by exactly 2 characters.) + +Here’s a reference table for some possible values, +choosing powers of two (though you’re not limited to it): + +┌─────────────┬────────┬───────┬──────────┬────────────┬─────────────┬─────────────┬───┐ +│ Sparsity │ (Bits) │ Extra │ № 4-char │ № 6-char │ № 8-char │ № 10-char │ … │ +│ │ │ chars │ IDs │ IDs │ IDs │ IDs │ … │ +┝━━━━━━━━━━━━━┿━━━━━━━━┿━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━━━┿━━━━━━━━━━━━━┿━━━━━━━━━━━━━┿━━━┥ +│ 0 │ 0 │ 0 │ 1024 │ ~1 million │ ~1 billion │ ~1 trillion │ … │ +│ 32 │ 5 │ 1 │ 32 │ 32736 │ ~32 million │ ~32 billion │ … │ +│ 256 │ 8 │ 1.6 │ 4 │ 4092 │ ~4 million │ ~4 billion │ … │ +│ 1024 │ 10 │ 2 │ 1 │ 1023 │ ~1 million │ ~1 billion │ … │ +│ 65536 │ 16 │ 3.2 │ 0 │ 16 │ 16368 │ ~16 million │ … │ +│ 4294967296 │ 32 │ 6.4 │ 0 │ 0 │ 256 │ 261888 │ … │ +│ 68719476735 │ 35.99… │ 7.2 │ 0 │ 0 │ 16 │ 16368 │ … │ +└─────────────┴────────┴───────┴──────────┴────────────┴─────────────┴─────────────┴───┘ + +Values above 8 are not particularly likely to be useful for type discrimination, +but could be for other forms of discrimination (e.g. shared tenantry), +or for pure sparseness (especially for authenticatory IDs). + +The discriminant should generally be smaller than sparsity; +otherwise the ID reuse problem will occur, +and type-unaware decoding will no longer be able to retrieve the discriminant in full. 
+But this is not checked, partly for reasons of efficiency, +and partly because an overly-large discriminant can be used legitimately, +as a simple offset to lengthen IDs (though sparsity and skipping ID 0 is probably generally a better tool). + +I’ve introduced the concept of typed TESID coders to the libraries. +TESID offers no assistance in maintaining discriminant values for your types; +that you must maintain yourself. + +It’s possible to achieve these effects in the database, +by adjusting the start (discriminant) and step (sparseness) of the sequence. +TESID provides this functionality anyway for three reasons: + +(a) Most databases don’t provide the tools to readily adjust sequence stride; + +(b) Your database is probably limited to 63 bits, + but TESID performs these operations in 100-bit space, + allowing sparseness and discrimination even on implausibly large ID values; and + +(c) You may be applying TESID to an existing sequence. + +—⁂— + +I identify three main parameter value groups: + +• Big sparsity, no or small discriminant: longer, much less guessable IDs. + Go far enough and this can be reasonable for authentication: + at 2³⁰, you’re adding six characters to the string length, + and only one in a billion IDs will be valid. + And you can go further still if you really want. + +• Big discriminant, no sparsity: just longer IDs, but fully guessable once you have enough. + If you go big enough, this can give the perfect illusion of a fixed-length string: + with a u64 ID, you can safely use 2⁶⁰ as a discriminant and get always-14-character TESIDs; + these will *in practice* be quite suitable for use as authentication. + But you have to be careful in choosing the big discriminant to use, + since the strings *are* still variable-width, + and full subranges are perfectly guessable. + All up, I don’t generally recommend this form. + +• Small sparsity, type discriminant: slightly longer and slightly less guessable IDs. 
+  This is a good choice for most purposes.
+
+(When I say “small sparsity”, I suggest values like 32, 256 or 1024.
+You don’t *have* to line up with bits, but I mildly prefer to.)
+
+—⁂—
+
+Sequence ideas:
+
+• Auto-incrementing database ID. This is the obvious and most likely use.
+
+• Time. If you used a unix timestamp in seconds, you will get 8-character IDs until the year 36812 which is hopefully enough for you. (Times before 2004-01-11T00:37:04 will be six characters, and from 1970-01-01T00:00:00 to 1970-01-13T13:16:15 four.) If you use YYYYMMDDHHMMSS, you’ll get 10-character IDs; subtract 2000 from the year and go with YYMMDDHHMMSS, and you’re back to 8 characters until the end of 2109.
+
+  Practical example that I am thinking of using: private but sharable (that is, the URL is the only authentication) pages on my website where I add a random token to the slug using the publication date.
diff --git a/javascript/CHANGELOG.md b/javascript/CHANGELOG.md
new file mode 100644
index 00000000..8e8eebc
--- /dev/null
+++ b/javascript/CHANGELOG.md
@@ -0,0 +1,7 @@
+Changelog
+=========
+
+1.0.0 (2022-07-14)
+------------------
+
+Initial release. Implements TESID 1.0.0.
diff --git a/javascript/COPYRIGHT b/javascript/COPYRIGHT
new file mode 100644
index 00000000..725fa78
--- /dev/null
+++ b/javascript/COPYRIGHT
@@ -0,0 +1,15 @@
+Copyright © 2022 Chris Morgan
+
+This project is distributed under the terms of two different licenses,
+at your choice:
+
+- Blue Oak Model License 1.0.0: https://blueoakcouncil.org/license/1.0.0
+- ISC License: https://opensource.org/licenses/ISC
+
+If you do not have particular cause to select the ISC license, Chris Morgan
+recommends that you select BlueOak-1.0.0, which is better and arguably simpler
+than ISC, which is only offered due to its greater recognition and popular use
+in the Node.js ecosystem. (BlueOak-1.0.0 was only published in March 2019.)
+
+When using this code, ensure you comply with the terms of at least one of
+these licenses.
diff --git a/javascript/README.md b/javascript/README.md
new file mode 100644
index 00000000..74e89ed
--- /dev/null
+++ b/javascript/README.md
@@ -0,0 +1,66 @@
+# tesid: JavaScript package for Textualised Encrypted Sequential Identifiers
+
+This package implements the TESID scheme for producing short, human-friendly,
+cryptographically-secure pseudorandom strings out of sequential identifiers.
+
+See <https://chrismorgan.info/tesid/> for more information about TESID.
+
+A simple demonstration:
+
+```javascript
+import { TESIDCoder } from "tesid";
+
+const secretKey = "000102030405060708090a0b0c0d0e0f";
+const coder = new TESIDCoder(secretKey);
+
+console.assert(coder.encode(0n) == "w2ej");
+console.assert(coder.encode(1n) == "w6um");
+console.assert(coder.encode(2n) == "x45g");
+console.assert(coder.encode(2n**20n - 1n) == "atcw");
+console.assert(coder.encode(2n**20n) == "8qwm6y");
+console.assert(coder.encode(2n**30n - 1n) == "3eipc7");
+console.assert(coder.encode(2n**30n) == "n3md95r4");
+console.assert(coder.encode(2n**100n - 1n) == "ia2bvpjaiju7g5uaxn5t");
+// And coder.encode(2n**100n) would throw RangeError.
+
+console.assert(coder.decode("w2ej") == 0);
+```
+
+```typescript
+// And with the most convenient form of type discrimination, here using TypeScript features:
+// (You can also do all this through TESIDCoder, but it’s less convenient.)
+import { TypedTESIDCoder } from "tesid";
+
+enum Type {
+	A = 0,
+	B = 1,
+	C = 2,
+}
+const typedCoder = new TypedTESIDCoder(coder, 256n, Type);
+
+console.assert(typedCoder.encode(Type.A, 0n) == "w2ej");
+console.assert(typedCoder.encode(Type.B, 0n) == "w6um");
+console.assert(typedCoder.encode(Type.C, 0n) == "x45g");
+console.assert(typedCoder.encode(Type.A, 1n) == "dh2h");
+console.assert(typedCoder.encode(Type.B, 1n) == "a6xy");
+console.assert(typedCoder.encode(Type.C, 1n) == "7xgj");
+console.assert(typedCoder.decode(Type.A, "w2ej") == 0);
+// typedCoder.decode(Type.B, "w2ej") would throw RangeError.
+console.assert(typedCoder.splitDecode("w2ej").id == 0);
+console.assert(typedCoder.splitDecode("w2ej").discriminant == Type.A);
+```
+
+This implementation is done as a single file with encryption and base conversion inlined for performance and code size.
+Minified with Terser with all functionality retained, it’s 1680 bytes (915 gzipped with zopfli).
+(Stripped to its bones—no TypedTESIDCoder, no splitDecode, no sparsity or discriminant, and slightly worse error reporting—it’s under 1200/700.)
+
+Type definitions and documentation are provided in tesid.d.ts.
+
+## Colophon
+
+**Authorship:** [Chris Morgan](https://chrismorgan.info/) is the author and maintainer of this library.
+
+**Licensing:** this library is distributed under the terms of the
+[Blue Oak Model License 1.0.0](https://blueoakcouncil.org/license/1.0.0) and the
+[ISC License](https://opensource.org/licenses/ISC), at your choice.
+See [COPYRIGHT](COPYRIGHT) for details.
diff --git a/javascript/package.json b/javascript/package.json new file mode 100644 index 00000000..ce85384 --- /dev/null +++ b/javascript/package.json @@ -0,0 +1,22 @@ +{ + "name": "tesid", + "version": "1.0.0", + "description": "Textualised Encrypted Sequential Identifiers", + "keywords": ["id", "autoincrement"], + "homepage": "https://chrismorgan.info/tesid/", + "type": "module", + "license": "(BlueOak-1.0.0 OR ISC)", + "author": "Chris Morgan ", + "main": "tesid.js", + "types": "tesid.d.ts", + "scripts": { + "test": "node tests.js && tsc --target es2020 typescript-tests.ts && node typescript-tests.js" + }, + "repository": { + "type": "git", + "url": "https://git.chrismorgan.info/tesid" + }, + "engines": { + "node": ">=10.4" + } +} diff --git a/javascript/tesid.d.ts b/javascript/tesid.d.ts new file mode 100644 index 00000000..bd97af3 --- /dev/null +++ b/javascript/tesid.d.ts @@ -0,0 +1,196 @@ +/** + * The TESID coder. + */ +export class TESIDCoder { + /** + * Initialise a TESID coder. + * + * The key string must be made up of exactly 32 lowercase hexadecimal (0-9a-f) characters, + * and should have been generated randomly. + * Refer to external documentation for information on key generation. + * + * @example + * new TESIDCoder("000102030405060708090a0b0c0d0e0f") + * + * @param key - A string of exactly 32 lowercase hexadecimal (0-9a-f) characters + * @throws {RangeError} if key is the wrong length or malformed + */ + constructor(key: string); + + /** + * Encode an ID. 
+	 *
+	 * @example
+	 * let coder = new TESIDCoder("000102030405060708090a0b0c0d0e0f");
+	 *
+	 * // Simple values
+	 * console.assert(coder.encode(0n) === "w2ej");
+	 *
+	 * // Tagging demonstration (first defining constants for consistent use):
+	 * const TYPE_SPARSITY = 256n; // meaning up to 256 possible types
+	 * const TYPE_A = 0n;
+	 * const TYPE_B = 1n;
+	 * const TYPE_C = 2n;
+	 *
+	 * // id 0 with three different discriminants/tags:
+	 * console.assert(coder.encode(0n, TYPE_SPARSITY, TYPE_A) === "w2ej");
+	 * console.assert(coder.encode(0n, TYPE_SPARSITY, TYPE_B) === "w6um");
+	 * console.assert(coder.encode(0n, TYPE_SPARSITY, TYPE_C) === "x45g");
+	 *
+	 * // id 1 with three different discriminants/tags:
+	 * console.assert(coder.encode(1n, TYPE_SPARSITY, TYPE_A) === "dh2h");
+	 * console.assert(coder.encode(1n, TYPE_SPARSITY, TYPE_B) === "a6xy");
+	 * console.assert(coder.encode(1n, TYPE_SPARSITY, TYPE_C) === "7xgj");
+	 *
+	 * @param id - A big integer in the range [0, 2¹⁰⁰)
+	 * @param sparsity - A positive integer to multiply id by (default 1, which doesn’t change id)
+	 * @param discriminant - A non-negative integer to add to id after sparsening (default 0, which doesn’t change id)
+	 * @returns A string of 4–20 base-32 characters, of even length
+	 * @throws {TypeError} Parameter id must be a bigint (number is not accepted because of its unsuitability beyond 2⁵³)
+	 * @throws {RangeError} (id * sparsity + discriminant) must be in the range [0, 2¹⁰⁰)
+	 */
+	encode(id: bigint, sparsity?: bigint, discriminant?: bigint): string;
+
+	/**
+	 * Decode an ID.
+	 *
+	 * @example
+	 * let coder = new TESIDCoder("000102030405060708090a0b0c0d0e0f");
+	 *
+	 * // Simple values
+	 * console.assert(coder.decode("w2ej") === 0n);
+	 *
+	 * // If sparsity and/or discriminant were used on encode, matching values
+	 * // must be provided here, or else it will fail to decode:
+	 * const TYPE_SPARSITY = 256n;
+	 * const TYPE_A = 0n;
+	 * const TYPE_B = 1n;
+	 * const TYPE_C = 2n;
+	 * console.assert(coder.decode("w2ej", TYPE_SPARSITY, TYPE_A) === 0n);
+	 * // And coder.decode("w2ej", TYPE_SPARSITY, TYPE_C) throws RangeError
+	 *
+	 * @param tesid - A TESID string, which should be 4–20 characters long
+	 * @param sparsity - A positive integer id was multiplied by (default 1, which doesn’t change id)
+	 * @param discriminant - A non-negative integer that was added to id after sparsening (default 0, which doesn’t change id)
+	 * @returns The decoded ID
+	 * @throws {RangeError} if decoding fails in any way
+	 */
+	decode(tesid: string, sparsity?: bigint, discriminant?: bigint): bigint;
+
+	/**
+	 * Decode an ID that was encoded with certain sparsity,
+	 * separating the discriminant and returning it alongside the ID.
+	 *
+	 * This is useful if you want to accept various discriminants;
+	 * one simple use case is better error reporting:
+	 * “that’s an ID for type A, but this takes IDs for type B”.
+	 *
+	 * This allows *you* to identify the discriminant,
+	 * but due to the encryption, anyone who has only the ID cannot;
+	 * if you want users to be able to discern the discriminant,
+	 * consider adding a human-friendly prefix to the ID;
+	 * I like a single uppercase letter or a word followed by an underscore.
+	 *
+	 * This requires that the discriminant be less than the sparsity,
+	 * or incorrect values will be produced.
+ * + * @param tesid - A TESID string, which should be 4–20 characters long + * @param sparsity - A positive integer id was multiplied by + * @returns The decoded ID and discriminant + * @throws {RangeError} if decoding fails in any way + */ + splitDecode(tesid: string, sparsity: bigint): {id: bigint, discriminant: bigint}; +} + +// This is approximately a supertype for TypeScript enums. +// The technique has the unfortunate property that, given `D extends Enum`, +// `D[keyof D]` will accept numbers (*any* numbers) as well as members of the enum, +// but I don’t think we can do any better with generics in TypeScript at this time. +type Enum = {[index: number]: string}; + +/** + * A TESID coder with type discrimination baked in. + * + * The type enum needs to provide a mapping from integers to strings, and strings to integers. + * + * If you find this type handling doesn’t quite satisfy you in any way, + * look at the code for this class: it’s a handful of lines of obvious code, + * so you can just duplicate it and apply your own slant. + * + * @example Using a TypeScript enum; methods’ examples start with this foundation. + * enum Type { + * A = 0, + * B = 1, + * C = 2, + * } + * const coder = new TypedTESIDCoder(new TESIDCoder("000102030405060708090a0b0c0d0e0f"), 256n, Type); + * + * @example Using plain JavaScript, the equivalent to the previous example. + * const Type = { + * A: 0, + * B: 1, + * C: 2, + * 0: "A", + * 1: "B", + * 2: "C", + * }; + * const coder = new TypedTESIDCoder(new TESIDCoder("000102030405060708090a0b0c0d0e0f"), 256n, Type); + */ +export class TypedTESIDCoder { + /** + * Initialise a typed TESID coder. + * + * This takes a `TESIDCoder` (rather than a key) so that you can share a coder, + * if you don’t always use the one sparsity and type enum. + * + * `sparsity` must exceed the highest variant in `typeEnum`. 
+ * + * @param coder - The TESID coder to use + * @param sparsity - A positive integer to multiply id by, which must exceed the highest variant in `typeEnum` + * @param typeEnum - An object for mapping discriminant names and values, using regular JavaScript indexing syntax + */ + constructor(coder: TESIDCoder, sparsity: bigint, typeEnum: D); + + /** + * Encode an ID and type. + * + * @example + * console.assert(coder.encode(Type.A, 0n) == "w2ej"); + * console.assert(coder.encode(Type.B, 0n) == "w6um"); + * console.assert(coder.encode(Type.A, 1n) == "dh2h"); + * + * @param type - The discriminant to use + * @param id - A big integer in the range [0, 2¹⁰⁰) + * @returns A string of 4–20 base-32 characters, of even length + * @throws {TypeError} Parameter type must be a string that is a valid key of the type enum + * @throws {TypeError} Parameter id must be a bigint (number is not accepted because of its unsuitability beyond 2⁵³) + * @throws {RangeError} (id * sparsity + discriminant) must be in the range [0, 2¹⁰⁰) + */ + encode(type: D[keyof D], id: bigint): string; + + /** + * Decode an ID and type. + * + * @example + * console.assert(coder.decode(Type.A, "w2ej") == 0n); + * console.assert(coder.decode(Type.B, "w6um") == 0n); + * console.assert(coder.decode(Type.A, "dh2h") == 1n); + * // And coder.decode(Type.A, "w5um") throws RangeError + * + * @param type - The discriminant to use + * @param tesid - A TESID string, which should be 4–20 characters long + * @throws {TypeError} Parameter type must be a string that is a valid key of the type enum + * @throws {RangeError} if decoding fails in any way + * @returns The decoded ID + */ + decode(type: D[keyof D], tesid: string): bigint; + + /** + * Decode an ID but separate and return its discriminant too. + * + * @param tesid - A TESID string, which should be 4–20 characters long + * @returns The decoded ID and discriminant; the discriminant is of the enum type, a string and not a number. 
+ * @throws {RangeError} if decoding fails in any way (including if the discriminant doesn’t correspond to a variant of the type enum) + */ + splitDecode(tesid: string): {id: bigint, discriminant: D[keyof D]}; +} diff --git a/javascript/tesid.js b/javascript/tesid.js new file mode 100644 index 00000000..9532ce4 --- /dev/null +++ b/javascript/tesid.js @@ -0,0 +1,178 @@ +const ALPHABET = "23456789abcdefghijkmnpqrstuvwxyz"; +const ALPHABET_INVERSE = [ + , , , , , , , , , , , , , , , , + , , , , , , , , , , , , , , , , + , , , , , , , , , , , , , , , , + , , 0n, 1n, 2n, 3n, 4n, 5n, 6n, 7n, , , , , , , + , , , , , , , , , , , , , , , , + , , , , , , , , , , , , , , , , + , 8n, 9n, 10n, 11n, 12n, 13n, 14n, 15n, 16n, 17n, 18n, , 19n, 20n, , + 21n, 22n, 23n, 24n, 25n, 26n, 27n, 28n, 29n, 30n, 31n, +]; +// (Yes, this JavaScript library gets worse error reporting than the Rust, +// because the only *real* reason for doing it so fancily in Rust was WrongDiscriminant, +// and that kind of data stapling to errors just doesn’t work well in JavaScript, +// as you’re heading outside the regular type system if using exceptions, +// which I think is most sensible, for fitting in with local conventions. +// There is also a bit of code size argument to it, but that’s secondary. +// If you want better error handling on wrong discriminant, use splitDecode.) 
+const throwRangeError = (msg = "bad TESID") => { + throw new RangeError(msg); +}; + +export class TESIDCoder { + constructor(key) { + if (!/^[0-9a-f]{32}$/.test(key)) { + throwRangeError("key must be 32 hex characters"); + } + key = new DataView(Uint8Array.from( + key.match(/../g).map(b => parseInt(b, 16)) + ).buffer); + let mask = (1n << 64n) - 1n; + let y = key.getBigUint64(8); + let x = key.getBigUint64(0); + let k = this.k = [y]; + let i = 0; + while (i < 29) { + x = ((((x << 55n) | (x >> 9n)) + y) & mask) ^ BigInt(i); + k[++i] = y = (((y << 2n) | (y >> 62n)) & mask) ^ x; + } + } + + encode(id, sparsity = 1n, discriminant = 0n) { + // Apply discriminant and sparsity, and check range. + id = (id * sparsity) + discriminant; + // Do you know how tempting it was to reuse the names `sparsity` and `discriminant` after this to save a few more bytes? + let n = id < 0n ? 0n : + id < (1n << 20n) ? 10n : + id < (1n << 30n) ? 15n : + id < (1n << 40n) ? 20n : + id < (1n << 50n) ? 25n : + id < (1n << 60n) ? 30n : + id < (1n << 70n) ? 35n : + id < (1n << 80n) ? 40n : + id < (1n << 90n) ? 45n : + id < (1n << 100n) ? 
50n : + 0n; + if (!n) { + throwRangeError("id out of range"); + } + + // 5peck encrypt + let k = this.k; + let mask = (1n << n) - 1n; + let x = id & mask; + let y = (id >> n) & mask; + let i = 0; + let buf = ""; + while (i < 30) { + x = ((((x << (n - 9n)) | (x >> 9n)) + y) ^ k[i++]) & mask; + y = (((y << 2n) | (y >> (n - 2n))) & mask) ^ x; + } + id = (y << n) | x; + + // Base-32 encode + while (id) { + buf = ALPHABET[id & 31n] + buf; + id >>= 5n; + } + return buf.padStart(Number(n) * 0.4, "2" /* ALPHABET[0] */); + } + + decode(tesid, sparsity = 1n, discriminant = 0n) { + // Length check + let n = tesid.length; + let id = 0n; + let i = 0; + if (n % 2 || n < 4 || n > 20) { + throwRangeError(); + } + + // Base-32 decode + while (i < n) { + let c = ALPHABET_INVERSE[tesid.charCodeAt(i++)]; + if (c == undefined) { + throwRangeError(); + } + id = (id << 5n) | c; + } + + // 5peck decrypt + n = BigInt(n * 2.5); // Done now rather than earlier because odd-length strings lead to non-integral n and BigInt() would throw a RangeError with a less friendly message + let mask = (1n << n) - 1n; + let x = id & mask; + let y = (id >> n) & mask; + let k = this.k; + for (i = 30; i--;) { + y ^= x; + y = ((y << (n - 2n)) | (y >> 2n)) & mask; + x = ((x ^ k[i]) - y) & mask; + x = ((x << 9n) | (x >> (n - 9n))) & mask; + } + id = (y << n) | x; + + if ( + // Overly-long-encoding range check + (n > 10 && id < (1n << (n * 2n - 10n))) || + // Discriminant and sparsity deapply + (id -= discriminant) < 0 || + id % sparsity + ) { + throwRangeError(); + } + return id / sparsity; + } + + splitDecode(tesid, sparsity) { + // (Variable reuse to save the “let” keyword, just ’cos.) 
+ tesid = this.decode(tesid); + return { + id: tesid / sparsity, + discriminant: tesid % sparsity, + }; + } +} + +export class TypedTESIDCoder { + constructor(coder, sparsity, typeEnum) { + this.c = coder; + this.s = sparsity; + this.t = typeEnum; + } + + encode(type, id) { + return this.c.encode(id, this.s, BigInt(type)); + } + + decode(type, tesid) { + return this.c.decode(tesid, this.s, BigInt(type)); + } + + splitDecode(tesid) { + // (Simple variable reuse, as before.) + tesid = this.c.splitDecode(tesid, this.s); + // 1. Gotta convert to a Number as that’s what typeEnum uses + // (otherwise strict equality comparison (===) would fail). + // Assumption here is that the type enum has no variants exceeding 2⁵³ − 1. + // If you do… wow. I’m not even going to try to rein you in. + // You clearly *want* things to blow up. Don’t blame me when they do, + // and someone accesses an entity by two different IDs + // (one with discriminant 2⁵³, one with discriminant 2⁵³ + 1). + // + // 2. Gotta check it’s a valid variant, or type unsoundness would occur, + // most notably seen in errors in exhaustiveness checking. + // (Yeah, encode/decode take a typeEnum variant without checking, + // but that’s fine as they don’t *produce* a value of that type.) + // + // (Could have made typeEnum optional, + // and then you could just use a bunch of constants instead of an object, + // but I don’t think the unlocked functionality is *particularly* useful, + // and Python and Rust can’t achieve it since they use actual enum types rather than just labelling numbers, + // so all up I’ve decided it’s better to insist on it.) 
+ if (!((tesid.discriminant = Number(tesid.discriminant)) in this.t)) { + throwRangeError(); + } + + return tesid; + } +} diff --git a/javascript/tests.js b/javascript/tests.js new file mode 100644 index 00000000..fd26ed0 --- /dev/null +++ b/javascript/tests.js @@ -0,0 +1,163 @@ +import { TESIDCoder } from "./tesid.js"; + +function assert_eq(a, b, msg) { + if ((a == null ? a : a.toString()) !== (b == null ? b : b.toString())) { + throw new Error( + "assert_eq failed" + (msg ? ": " + msg : "") + + "\n left: " + a + + "\nright: " + b + ); + } +} + +function assert_ne(a, b, msg) { + if ((a == null ? a : a.toString()) === (b == null ? b : b.toString())) { + throw new Error( + "assert_ne failed" + (msg ? ": " + msg : "") + + "\n left: " + a + + "\nright: " + b + ); + } +} + +function assert_throws(f, msg_or_type) { + let x; + try { + x = f(); + } catch (e) { + if (msg_or_type != null) { + if (typeof msg_or_type == "string") { + if (e.toString() != msg_or_type) { + throw new Error("Expected exception `" + msg_or_type + "`, but found exception `" + e.toString() + "`"); + } + } else { + if (!(e instanceof msg_or_type)) { + throw new Error("Expected exception of type " + msg_or_type.name + ", but found exception `" + e.toString() + "`"); + } + } + } + return; + } + throw new Error("Expected exception " + + (msg_or_type == null ? "" : + typeof msg_or_type == "string" ? "`" + msg_or_type + "`" : + "of type " + msg_or_type.name) + + "\nCode did not fail, but returned " + (x == null ? 
x : x.toString())); +} + +function c(coder, number, string) { + assert_eq(coder.encode(number), string); + assert_eq(coder.decode(string), number); +} + +let coder = new TESIDCoder("00000000000000000000000000000000"); +c(coder, 0n , "4kcc"); +c(coder, 2n**20n - 1n, "3rck"); +c(coder, 2n**20n , "ju2sgs"); +c(coder, 2n**30n - 1n, "zangyh"); +c(coder, 2n**30n , "2aux4u3h"); +c(coder, 2n**40n - 1n, "3cd7rc4h"); +c(coder, 2n**40n , "m8669y33k6"); +c(coder, 2n**50n - 1n, "45e9rbrvvu"); +c(coder, 2n**50n , "t47yf553iv8t"); +c(coder, 2n**60n - 1n, "cwd8t75epzje"); +c(coder, 2n**60n , "86hk4d8hj4yvcy"); +c(coder, 2n**64n - 1n, "sirnf2k2d2m3bm"); +c(coder, 2n**70n - 1n, "m77g4ezr3e8qay"); +c(coder, 2n**70n , "43xf2jj6r6qm8bw4"); +c(coder, 2n**80n - 1n, "6h3wb7wytjr5tbrd"); +c(coder, 2n**80n , "4vumjq33d8iiwaharq"); +c(coder, 2n**90n - 1n, "qd7s3csnc5yfrrud5t"); +c(coder, 2n**90n , "jd3vsipfn69ia72chuvx"); +c(coder, 2n**100n - 1n, "628fg5kyid3z2vf2j4tf"); + +coder = new TESIDCoder("000102030405060708090a0b0c0d0e0f"); +c(coder, 0n , "w2ej"); +c(coder, 2n**20n - 1n, "atcw"); +c(coder, 2n**20n , "8qwm6y"); +c(coder, 2n**30n - 1n, "3eipc7"); +c(coder, 2n**30n , "n3md95r4"); +c(coder, 2n**40n - 1n, "nnz4z5qb"); +c(coder, 2n**40n , "st9fvria97"); +c(coder, 2n**50n - 1n, "qt42fug7hq"); +c(coder, 2n**50n , "dykqxtu2ieqi"); +c(coder, 2n**60n - 1n, "h7rhnw6tfhun"); +c(coder, 2n**60n , "xb5c8isevin9i3"); +c(coder, 2n**64n - 1n, "t62mijffzuvu4e"); +c(coder, 2n**70n - 1n, "n6n8jq6ike9dnj"); +c(coder, 2n**70n , "zk9d3setywjf7uwu"); +c(coder, 2n**80n - 1n, "bqqei5vmzkqjfru3"); +c(coder, 2n**80n , "z83vvq5u84sit9g7pd"); +c(coder, 2n**90n - 1n, "cpawgn8snjvverxvmp"); +c(coder, 2n**90n , "397btwmkh5y7sjz2xu82"); +c(coder, 2n**100n - 1n, "ia2bvpjaiju7g5uaxn5t"); + +// Encoding bad range or type +assert_throws(() => coder.encode(-1n), "RangeError: id out of range"); +assert_throws(() => coder.encode(2n**100n), "RangeError: id out of range"); +assert_throws(() => coder.encode(0), TypeError); // 
“can't convert 0 to a BigInt” or similar. + +// Test misencodings: 0 is w2ej, but if 0 were encrypted with n=15 +// instead of n=10, it’d be m2eig5—so that value isn’t allowed. +assert_throws(() => coder.decode("m2eig5"), "RangeError: bad TESID"); + +// … but slightly changed values are probably valid (since only one in +// 2¹⁰ is invalid). +assert_eq(coder.decode("m2eig6"), 473063752); + +// Also a few more at the boundaries for confidence: +// 2²⁰−1 but encoded with n=15 instead of n=10 +assert_throws(() => coder.decode("vf5fem"), "RangeError: bad TESID"); +// 2³⁰−1 but encoded with n=20 instead of n=15 +assert_throws(() => coder.decode("ixs6h9ma"), "RangeError: bad TESID"); +// 2³⁰−1 but encoded with n=50 instead of n=10 +assert_throws(() => coder.decode("uhkprgrirp45pe54twsa"), "RangeError: bad TESID"); + +// Bad string lengths +assert_throws(() => coder.decode(""), "RangeError: bad TESID"); +assert_throws(() => coder.decode("2"), "RangeError: bad TESID"); +assert_throws(() => coder.decode("22"), "RangeError: bad TESID"); +assert_throws(() => coder.decode("222"), "RangeError: bad TESID"); +assert_throws(() => coder.decode("22222"), "RangeError: bad TESID"); +assert_throws(() => coder.decode("2222222222222222222"), "RangeError: bad TESID"); +assert_throws(() => coder.decode("222222222222222222222"), "RangeError: bad TESID"); + +// … but just so it’s clear, ones are fine, it was just the lengths that were wrong. +c(coder, 173734n, "2222"); +c(coder, 592178178n, "222222"); +c(coder, 111515659577240532774228475483n, "22222222222222222222"); + +// Now time for some tagging. 
+function c2(coder, number, string, sparsity, discriminant, split_assert) { + assert_eq(coder.encode(number, sparsity, discriminant), string); + assert_eq(coder.decode(string, sparsity, discriminant), number); + let s = coder.splitDecode(string, sparsity); + split_assert(discriminant, s.discriminant); + split_assert(number, s.id); +} + +c2(coder, 0n, "w2ej", 1n, 0n, assert_eq); +c2(coder, 0n, "w6um", 1n, 1n, assert_ne); +c2(coder, 0n, "x45g", 1n, 2n, assert_ne); + +c2(coder, 0n, "w2ej", 100n, 0n, assert_eq); +c2(coder, 0n, "w6um", 100n, 1n, assert_eq); +c2(coder, 0n, "x45g", 100n, 2n, assert_eq); +c2(coder, 1n, "ypbn", 100n, 0n, assert_eq); +c2(coder, 1n, "k9pw", 100n, 1n, assert_eq); +c2(coder, 1n, "b7nc", 100n, 2n, assert_eq); +c2(coder, 2n, "r9yc", 100n, 0n, assert_eq); +c2(coder, 2n, "arf2", 100n, 1n, assert_eq); +c2(coder, 2n, "z6wh", 100n, 2n, assert_eq); +c(coder, 0n, "w2ej"); +c(coder, 1n, "w6um"); +c(coder, 2n, "x45g"); +c(coder, 100n, "ypbn"); +c(coder, 101n, "k9pw"); +c(coder, 102n, "b7nc"); +c(coder, 200n, "r9yc"); +c(coder, 201n, "arf2"); +c(coder, 202n, "z6wh"); +// The highest sparsity that’s always valid given 64-bit discriminant and id (which is the case in the Rust implementation): 2³⁶ − 1 +c2(coder, 2n**64n-1n, "fjwz5jk3p4gz9aqes22e", (1n<<36n) - 1n, 2n**64n-1n, assert_ne); +c(coder, 1267650600228229401427983728640n, "fjwz5jk3p4gz9aqes22e"); diff --git a/javascript/typescript-tests.ts b/javascript/typescript-tests.ts new file mode 100644 index 00000000..e72be83 --- /dev/null +++ b/javascript/typescript-tests.ts @@ -0,0 +1,108 @@ +import { TypedTESIDCoder, TESIDCoder } from "./tesid.js"; + +function assert_eq(a: any, b: any, msg?: string) { + if ((a == null ? a : a.toString()) !== (b == null ? b : b.toString())) { + throw new Error( + "assert_eq failed" + (msg ? ": " + msg : "") + + "\n left: " + a + + "\nright: " + b + ); + } +} + +function assert_ne(a: any, b: any, msg?: string) { + if ((a == null ? a : a.toString()) === (b == null ? 
b : b.toString())) { + throw new Error( + "assert_ne failed" + (msg ? ": " + msg : "") + + "\n left: " + a + + "\nright: " + b + ); + } +} + +function assert_throws(f: () => any, msg_or_type: string | typeof Error) { + let x: any; + try { + x = f(); + } catch (e) { + if (msg_or_type != null) { + if (typeof msg_or_type == "string") { + if (e.toString() != msg_or_type) { + throw new Error("Expected exception `" + msg_or_type + "`, but found exception `" + e.toString() + "`"); + } + } else { + if (!(e instanceof msg_or_type)) { + throw new Error("Expected exception of type " + msg_or_type.name + ", but found exception `" + e.toString() + "`"); + } + } + } + return; + } + throw new Error("Expected exception " + + (msg_or_type == null ? "" : + typeof msg_or_type == "string" ? "`" + msg_or_type + "`" : + "of type " + msg_or_type.name) + + "\nCode did not fail, but returned " + (x == null ? x : x.toString())); +} + +// First with a true TypeScript enum +(function() { + enum Type { + A = 0, + B = 1, + C = 2, + } + const coder = new TypedTESIDCoder(new TESIDCoder("000102030405060708090a0b0c0d0e0f"), 256n, Type); + + assert_eq(coder.encode(Type.A, 0n), "w2ej"); + assert_eq(coder.encode(Type.B, 0n), "w6um"); + assert_eq(coder.encode(Type.A, 1n), "dh2h"); + assert_eq(coder.encode(0, 1n), "dh2h"); + assert_eq(coder.encode(3, 1n), "r6wb"); // Unfortunately accepted by TypeScript; and deliberately permitted by our code. + // @ts-expect-error TS2345: Argument of type 'string' is not assignable to parameter of type 'Type'. + assert_throws(() => coder.encode(Type[Type.A], 1n), SyntaxError); // “Cannot convert A to a BigInt” or similar. (SyntaxError instead of TypeError is acceptable to me.) + enum WrongType { A = 0, B = 1, C = 2 } + // @ts-expect-error TS2345: Argument of type 'WrongType.A' is not assignable to parameter of type 'Type'. 
+ coder.encode(WrongType.A, 0n); + + assert_eq(coder.decode(Type.A, "w2ej"), 0n); + assert_eq(coder.decode(Type.B, "w6um"), 0n); + assert_eq(coder.decode(Type.A, "dh2h"), 1n); + assert_eq(coder.decode(3, "r6wb"), 1n); + assert_throws(() => coder.decode(Type.A, "w5um"), RangeError); + // … but although encode and decode *accepted* an invalid variant, splitDecode mustn’t *produce* an invalid variant. + assert_throws(() => coder.splitDecode("r6wb"), "RangeError: bad TESID"); + let sd = coder.splitDecode("w6um"); + // (Doing *strict* equality checking this time.) + assert_eq(sd.id === 0n, true); + assert_eq(sd.discriminant === Type.B, true); +})(); + +// With a JavaScript-style enum +(function() { + const Type = { + A: 0, + B: 1, + C: 2, + 0: "A", + 1: "B", + 2: "C", + }; + const coder = new TypedTESIDCoder(new TESIDCoder("000102030405060708090a0b0c0d0e0f"), 256n, Type); + + assert_eq(coder.encode(Type.A, 0n), "w2ej"); + assert_eq(coder.encode(Type.B, 0n), "w6um"); + assert_eq(coder.encode(Type.A, 1n), "dh2h"); + assert_eq(coder.encode(0, 1n), "dh2h"); + assert_eq(coder.encode(3, 1n), "r6wb"); // Unfortunately accepted by TypeScript; and deliberately permitted by our code. + // Note no compile-time TS2345 error this time—it resolves to string | number. Pity, but can’t really do better. + assert_throws(() => coder.encode(Type[Type.A], 1n), SyntaxError); // “Cannot convert A to a BigInt” or similar. (SyntaxError instead of TypeError is acceptable to me.) + + assert_eq(coder.decode(Type.A, "w2ej"), 0n); + assert_eq(coder.decode(Type.B, "w6um"), 0n); + assert_eq(coder.decode(Type.A, "dh2h"), 1n); + assert_eq(coder.decode(3, "r6wb"), 1n); + assert_throws(() => coder.decode(Type.A, "w5um"), RangeError); + // … but although encode and decode *accepted* an invalid variant, splitDecode mustn’t *produce* an invalid variant. 
+ assert_throws(() => coder.splitDecode("r6wb"), "RangeError: bad TESID"); +})(); diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst new file mode 100644 index 00000000..8e8eebc --- /dev/null +++ b/python/CHANGELOG.rst @@ -0,0 +1,7 @@ +Changelog +========= + +1.0.0 (2022-07-14) +------------------ + +Initial release. Implements TESID 1.0.0. diff --git a/python/COPYRIGHT b/python/COPYRIGHT new file mode 100644 index 00000000..84c94fc --- /dev/null +++ b/python/COPYRIGHT @@ -0,0 +1,15 @@ +Copyright © 2022 Chris Morgan + +This project is distributed under the terms of two different licenses, +at your choice: + +- Blue Oak Model License 1.0.0: https://blueoakcouncil.org/license/1.0.0 +- MIT License: https://opensource.org/licenses/MIT + +If you do not have particular cause to select the MIT license, Chris Morgan +recommends that you select BlueOak-1.0.0, which is better and arguably simpler +than MIT, which is only offered due to its greater recognition and popular use +in the Python ecosystem. (BlueOak-1.0.0 was only published in March 2019.) + +When using this code, ensure you comply with the terms of at least one of +these licenses. diff --git a/python/README.rst b/python/README.rst new file mode 100644 index 00000000..74e092b --- /dev/null +++ b/python/README.rst @@ -0,0 +1,62 @@ +====================================================================== +tesid: Python library for Textualised Encrypted Sequential Identifiers +====================================================================== + +This package implements the TESID scheme for producing short, human-friendly, +cryptographically-secure pseudorandom strings out of sequential identifiers. + +See https://chrismorgan.info/tesid/ for more information about TESID. + +A simple demonstration: + +.. 
code:: python + + from tesid import TESIDCoder + + secret_key = '000102030405060708090a0b0c0d0e0f' + coder = TESIDCoder(secret_key) + + assert coder.encode(0) == 'w2ej' + assert coder.encode(1) == 'w6um' + assert coder.encode(2) == 'x45g' + assert coder.encode(2**20 - 1) == 'atcw' + assert coder.encode(2**20) == '8qwm6y' + assert coder.encode(2**30 - 1) == '3eipc7' + assert coder.encode(2**30) == 'n3md95r4' + assert coder.encode(2**100 - 1) == 'ia2bvpjaiju7g5uaxn5t' + # coder.encode(2**100) would raise ValueError. + assert coder.decode('w2ej') == 0 + + # And with the most convenient form of type discrimination: + # (You can also do all this through TESIDCoder, but it’s less convenient.) + from tesid import TypedTESIDCoder, SplitDecode + from enum import Enum + + class Type(Enum): + A = 0 + B = 1 + C = 2 + + typed_coder = TypedTESIDCoder(coder, 256, Type) + + assert typed_coder.encode(Type.A, 0) == 'w2ej' + assert typed_coder.encode(Type.B, 0) == 'w6um' + assert typed_coder.encode(Type.C, 0) == 'x45g' + assert typed_coder.encode(Type.A, 1) == 'dh2h' + assert typed_coder.encode(Type.B, 1) == 'a6xy' + assert typed_coder.encode(Type.C, 1) == '7xgj' + assert typed_coder.decode(Type.A, 'w2ej') == 0 + # typed_coder.decode(Type.B, 'w2ej') will raise ValueError. + assert typed_coder.split_decode('w2ej') == \ + SplitDecode(id=0, discriminant=Type.A) + +Colophon +======== + +**Authorship:** `Chris Morgan `_ is the author and +maintainer of this library. + +**Licensing:** this library is distributed under the terms of the +`Blue Oak Model License 1.0.0 `_ and +the `MIT License `_, at your choice. +See COPYRIGHT_ for details. 
diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 00000000..b0f0765 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=42"] +build-backend = "setuptools.build_meta" diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 00000000..ea9e1bf --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,28 @@ +[metadata] +name = tesid +version = 1.0.0 +author = Chris Morgan +author_email = py@chrismorgan.info +description = Textualised Encrypted Sequential Identifiers +long_description = file: README.rst +long_description_content_type = text/x-rst +url = https://chrismorgan.info/tesid/ +license = Blue Oak Model License 1.0.0 (BlueOak-1.0.0) OR MIT License (MIT) +license_files = COPYRIGHT +classifiers = + Development Status :: 5 - Production/Stable + Intended Audience :: Developers + License :: OSI Approved :: MIT License + Operating System :: OS Independent + Programming Language :: Python :: 3 + Typing :: Typed +keywords = id, autoincrement + +[options] +package_dir = + = src +packages = find: +python_requires = >=3.6 + +[options.packages.find] +where = src diff --git a/python/src/tesid/__init__.py b/python/src/tesid/__init__.py new file mode 100644 index 00000000..843bced --- /dev/null +++ b/python/src/tesid/__init__.py @@ -0,0 +1,315 @@ +r""" +TESID: Textualised Encrypted Sequential Identifiers + +Example +======= + +>>> from tesid import TESIDCoder +>>> secret_key = '000102030405060708090a0b0c0d0e0f' +>>> coder = TESIDCoder(secret_key) +>>> coder.encode(0) +'w2ej' +>>> coder.encode(1) +'w6um' +>>> coder.encode(2) +'x45g' +>>> coder.encode(2**20 - 1) +'atcw' +>>> coder.encode(2**20) +'8qwm6y' +>>> coder.encode(2**30 - 1) +'3eipc7' +>>> coder.encode(2**30) +'n3md95r4' +>>> coder.encode(2**100 - 1) +'ia2bvpjaiju7g5uaxn5t' +>>> coder.encode(2**100) +Traceback (most recent call last): + ... 
from typing import TypeVar, Generic

TDiscriminant = TypeVar('TDiscriminant')


class SplitDecode(Generic[TDiscriminant]):
    """A decoded TESID, split into the plain ID and its discriminant.

    Returned by ``TESIDCoder.split_decode`` (discriminant is an ``int``)
    and ``TypedTESIDCoder.split_decode`` (discriminant is the enum type).
    """

    __slots__ = 'id', 'discriminant'
    # The sequential identifier with the discriminant removed.
    id: int
    # The discriminant that was mixed into the encoded ID.
    discriminant: TDiscriminant

    def __init__(self, id: int, discriminant: TDiscriminant):
        self.id = id
        self.discriminant = discriminant

    def __repr__(self):
        return f'SplitDecode(id={self.id!r}, discriminant={self.discriminant!r})'

    def __eq__(self, other):
        # Return NotImplemented for foreign types instead of raising
        # AttributeError, per the usual __eq__ contract; Python then
        # falls back to identity comparison (== yields False).
        if not isinstance(other, SplitDecode):
            return NotImplemented
        return self.id == other.id and self.discriminant == other.discriminant
+ + >>> coder.encode(0) + 'w2ej' + + You can use sparsity and discriminant for things like type tagging: + + >>> coder.encode(0, sparsity=TYPE_SPARSITY, discriminant=TYPE_A) + 'w2ej' + >>> coder.encode(0, sparsity=TYPE_SPARSITY, discriminant=TYPE_B) + 'w6um' + >>> coder.encode(0, sparsity=TYPE_SPARSITY, discriminant=TYPE_C) + 'x45g' + >>> coder.encode(1, sparsity=TYPE_SPARSITY, discriminant=TYPE_A) + 'dh2h' + >>> coder.encode(1, sparsity=TYPE_SPARSITY, discriminant=TYPE_B) + 'a6xy' + >>> coder.encode(1, sparsity=TYPE_SPARSITY, discriminant=TYPE_C) + '7xgj' + + """ + id = id * sparsity + discriminant + + i = (None if id < 0 else + 0 if id < 2**20 else + 1 if id < 2**30 else + 2 if id < 2**40 else + 3 if id < 2**50 else + 4 if id < 2**60 else + 5 if id < 2**70 else + 6 if id < 2**80 else + 7 if id < 2**90 else + 8 if id < 2**100 else None) + + if i is None: + raise ValueError('id out of range') + + return base32.encode( + fpeck.encrypt(self.expanded_key, (i + 2) * 5, id), + (i + 2) * 2, + ) + + def decode(self, tesid: str, *, sparsity: int = 1, discriminant: int = 0) -> int: + """ + Decode an ID. + + Raises ValueError if anything goes wrong. + + >>> coder.decode('w2ej') + 0 + >>> coder.decode('invalid') + Traceback (most recent call last): + ... + ValueError: invalid TESID (wrong length) + + If sparsity and/or discriminant were used on encode, matching values + must be provided here, or else it will fail to decode: + + >>> coder.decode('w2ej', sparsity=TYPE_SPARSITY, discriminant=TYPE_A) + 0 + >>> coder.decode('w2ej', sparsity=TYPE_SPARSITY, discriminant=TYPE_C) + Traceback (most recent call last): + ... 
+ ValueError: invalid TESID (wrong discriminant or sparsity) + + """ + n = (len(tesid) // 2) * 5 + if len(tesid) % 2 or not (10 <= n <= 50): + raise ValueError('invalid TESID (wrong length)') + id = base32.decode(tesid) + if id is None: + raise ValueError('invalid TESID (wrong characters)') + id = fpeck.decrypt(self.expanded_key, n, id) + + # Overly-long-encoding range check + if n > 10 and id < 2**(n * 2 - 10): + raise ValueError('invalid TESID (overly long encoding)') + + id -= discriminant + if id < 0 or id % sparsity: + raise ValueError('invalid TESID (wrong discriminant or sparsity)') + return id // sparsity + + def split_decode(self, tesid: str, sparsity: int) -> SplitDecode[int]: + """ + Decode an ID that was encoded with certain sparsity, + separating the discriminant and returning it alongside the ID. + + This is useful if you want to accept various discriminants; + one simple use case is better error reporting: + “that’s an ID for type A, but this takes IDs for type B”. + + This allows *you* to identify the discriminant, + but due to the encryption, anyone who has only the ID cannot; + if you want users to be able to discern the discriminant, + consider adding a human-friendly prefix to the ID; + I like a single uppercase letter or a word followed by an underscore. + + This requires that the discriminant be less than the sparsity, + or incorrect values will be produced. 
+ + Demonstration: + + >>> coder.split_decode('w2ej', TYPE_SPARSITY) + SplitDecode(id=0, discriminant=0) + >>> coder.split_decode('w6um', TYPE_SPARSITY) + SplitDecode(id=0, discriminant=1) + >>> coder.split_decode('x45g', TYPE_SPARSITY) + SplitDecode(id=0, discriminant=2) + >>> coder.split_decode('dh2h', TYPE_SPARSITY) + SplitDecode(id=1, discriminant=0) + >>> coder.split_decode('a6xy', TYPE_SPARSITY) + SplitDecode(id=1, discriminant=1) + >>> _.id + 1 + >>> coder.split_decode('7xgj', TYPE_SPARSITY) + SplitDecode(id=1, discriminant=2) + >>> _.discriminant + 2 + + """ + id = self.decode(tesid) + return SplitDecode(id=id // sparsity, discriminant=id % sparsity) + + +TTypeEnum = TypeVar('TTypeEnum', bound=Enum) +class TypedTESIDCoder(Generic[TTypeEnum]): + """ + A TESID coder with type discrimination baked in. + + >>> from tesid import TypedTESIDCoder, TESIDCoder + >>> from enum import Enum + >>> class Type(Enum): + ... A = 0 + ... B = 1 + ... C = 2 + >>> coder = TypedTESIDCoder(TESIDCoder('000102030405060708090a0b0c0d0e0f'), 256, Type) + + (Methods’ examples start with this foundation.) + """ + + def __init__(self, coder: TESIDCoder, sparsity: int, type_enum: type[TTypeEnum]): + """ + Initialise a typed TESID coder. + + This takes a ``TESIDCoder`` (rather than a key) so that you can share a + coder, if you don’t always use the one sparsity and type enum. + + ``sparsity`` must exceed the highest variant in ``type_enum``. + """ + self.coder = coder + self.sparsity = sparsity + self.type = type_enum + + def __repr__(self): + return f'TypedTESIDCoder(coder={self.coder!r}, sparsity={self.sparsity!r}, type={self.type!r})' + + def encode(self, type: TTypeEnum, id: int): + """ + Encode an ID and type. 
+ + >>> coder.encode(Type.A, 0) + 'w2ej' + >>> coder.encode(Type.B, 0) + 'w6um' + >>> coder.encode(Type.A, 1) + 'dh2h' + + """ + return self.coder.encode(id, sparsity=self.sparsity, discriminant=type.value) + + def decode(self, type: TTypeEnum, tesid: str): + """ + Decode an ID and type. + + >>> coder.decode(Type.A, 'w2ej') + 0 + >>> coder.decode(Type.B, 'w6um') + 0 + >>> coder.decode(Type.A, 'dh2h') + 1 + >>> coder.decode(Type.A, 'w6um') + Traceback (most recent call last): + ... + ValueError: invalid TESID (wrong discriminant or sparsity) + + """ + return self.coder.decode(tesid, sparsity=self.sparsity, discriminant=type.value) + + def split_decode(self, tesid: str) -> SplitDecode[TTypeEnum]: + """ + Decode an ID but separate and return its discriminant too. + + >>> coder.split_decode('w2ej') + SplitDecode(id=0, discriminant=) + >>> coder.split_decode('w6um') + SplitDecode(id=0, discriminant=) + >>> coder.split_decode('x45g') + SplitDecode(id=0, discriminant=) + >>> coder.split_decode('dh2h') + SplitDecode(id=1, discriminant=) + >>> coder.split_decode('a6xy') + SplitDecode(id=1, discriminant=) + >>> _.id + 1 + >>> coder.split_decode('7xgj') + SplitDecode(id=1, discriminant=) + >>> _.discriminant == Type.C + True + >>> coder.split_decode('6mqv') # id=0, discriminant=3 + Traceback (most recent call last): + ... 
# Specialised base-32 encoding and decoding for TESID.
# Alphabet: digits and letters, minus 0, 1, l, o (lookalike characters).

from typing import Optional

__all__ = ['encode', 'decode']

ALPHABET = '23456789abcdefghijkmnpqrstuvwxyz'

# Maps ord(character) to its base-32 value, or -1 for characters outside
# the alphabet. The table stops at ord('z') == 122; higher ordinals are
# handled by catching IndexError in decode().
ALPHABET_INVERSE = [
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1,  0,  1,  2,  3,  4,  5,  6,  7, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, -1, 19, 20, -1,
    21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
]


def encode(input: int, pad_to_length: int) -> str:
    """Base-32-encode a non-negative integer.

    The output is left-padded with ALPHABET[0] ('2') to at least
    ``pad_to_length`` characters. Arbitrary magnitudes are supported.
    """
    digits = []
    while input:
        digits.append(ALPHABET[input & 31])
        input >>= 5
    while len(digits) < pad_to_length:
        digits.append(ALPHABET[0])
    return ''.join(digits[::-1])


def decode(input: str) -> Optional[int]:
    """Base-32-decode a string, returning None if any character is invalid.

    No length or overflow checking is performed; the caller is responsible
    for range-checking the result.
    """
    # NOTE: the return annotation is spelled Optional[int] rather than
    # `int | None`, because PEP 604 syntax is evaluated at import time and
    # raises TypeError on the Python versions < 3.10 that setup.cfg's
    # `python_requires = >=3.6` declares supported.
    out = 0
    for c in input:
        try:
            n = ALPHABET_INVERSE[ord(c)]
        except IndexError:
            # Character beyond the end of the table (ordinal > 122).
            return None
        if n == -1:
            return None
        out = (out << 5) | n
    return out
from typing import List

__all__ = ['expand', 'encrypt', 'decrypt']


def rotate_left(lhs: int, rhs: int, bits: int) -> int:
    """Rotate ``lhs`` left by ``rhs`` within a ``bits``-wide word.

    The result is not masked to ``bits`` bits; callers mask as needed.
    """
    return (lhs << rhs) | (lhs >> (bits - rhs))


def rotate_right(lhs: int, rhs: int, bits: int) -> int:
    """Rotate ``lhs`` right by ``rhs`` within a ``bits``-wide word.

    The result is not masked to ``bits`` bits; callers mask as needed.
    """
    return (lhs << (bits - rhs)) | (lhs >> rhs)


def expand(key: int) -> List[int]:
    """Expand a 128-bit key into the list of 30 round keys.

    The schedule seeds with the low 64 bits of the key and derives 29
    further subkeys using the same round function as ``encrypt``, fixed
    at a 64-bit word size.
    """
    mask = 2**64 - 1
    y = key & mask
    k = [y]
    x = (key >> 64) & mask

    for i in range(29):
        x = ((rotate_right(x, 9, 64) + y) ^ i) & mask
        y = (rotate_left(y, 2, 64) & mask) ^ x
        k.append(y)

    return k


def encrypt(k: List[int], n: int, plain_text: int) -> int:
    """Encrypt a 2n-bit value (treated as two n-bit halves) with round keys ``k``."""
    mask = 2**n - 1
    x = plain_text & mask
    y = (plain_text >> n) & mask

    for ki in k:
        x = ((rotate_right(x, 9, n) + y) ^ ki) & mask
        y = (rotate_left(y, 2, n) ^ x) & mask

    return (y << n) | x


def decrypt(k: List[int], n: int, cipher_text: int) -> int:
    """Invert ``encrypt``: decrypt a 2n-bit value with round keys ``k``.

    Applies the inverse round function with the round keys in reverse
    order (``reversed`` avoids the list copy that slicing would make).
    """
    mask = 2**n - 1
    x = cipher_text & mask
    y = (cipher_text >> n) & mask

    for ki in reversed(k):
        y = rotate_right(x ^ y, 2, n) & mask
        x = rotate_left(((x ^ ki) - y) & mask, 9, n) & mask

    return (y << n) | x
from .. import base32
import unittest


class Base32Tests(unittest.TestCase):
    def tests(self):
        """Exhaustive encode/decode vectors, including padding behaviour."""
        def c(number: int, pad_to_length: int, encoded: str):
            # Check both directions: encode pads, decode ignores padding.
            self.assertEqual(base32.encode(number, pad_to_length), encoded)
            self.assertEqual(base32.decode(encoded), number)

        # '345678' and 'base32' interpreted as base-32 numerals.
        B32_345678 = (1 * 32**5
                      + 2 * 32**4
                      + 3 * 32**3
                      + 4 * 32**2
                      + 5 * 32**1
                      + 6 * 32**0)

        B32_base32 = (9 * 32**5
                      + 8 * 32**4
                      + 24 * 32**3
                      + 12 * 32**2
                      + 1 * 32**1
                      + 0 * 32**0)

        c(0, 0, '')
        c(0, 1, '2')
        c(0, 7, '2222222')
        c(1, 0, '3')
        c(1, 1, '3')
        c(1, 20, '22222222222222222223')
        c(32, 0, '32')
        c(32, 1, '32')
        c(32, 2, '32')
        c(32, 3, '232')
        c(B32_345678, 0, '345678')
        c(B32_base32, 0, 'base32')
        c(B32_345678, 20, '22222222222222345678')
        c(B32_base32, 20, '22222222222222base32')
        c(2**100 - 1, 0, 'zzzzzzzzzzzzzzzzzzzz')

        # This implementation supports arbitrary length, no truncation.
        c(1459980823972598128486511383358617792788444579872, 0,
          'zyxwvutsrqpnmkjihgfedcba98765432')
        c(0, 1000, '2' * 1000)
        c(32**1000 - 1, 0, 'z' * 1000)
import unittest

from .. import TESIDCoder, SplitDecode


class TESIDTests(unittest.TestCase):
    def test_tesids(self):
        """Known-answer vectors, error cases, and sparsity/discriminant tagging."""
        def c(coder: TESIDCoder, number: int, string: str):
            # Round-trip: encoding must produce the vector, decoding must invert it.
            self.assertEqual(coder.encode(number), string)
            self.assertEqual(coder.decode(string), number)

        coder = TESIDCoder('00000000000000000000000000000000')
        c(coder, 0         , '4kcc')
        c(coder, 2**20 - 1 , '3rck')
        c(coder, 2**20     , 'ju2sgs')
        c(coder, 2**30 - 1 , 'zangyh')
        c(coder, 2**30     , '2aux4u3h')
        c(coder, 2**40 - 1 , '3cd7rc4h')
        c(coder, 2**40     , 'm8669y33k6')
        c(coder, 2**50 - 1 , '45e9rbrvvu')
        c(coder, 2**50     , 't47yf553iv8t')
        c(coder, 2**60 - 1 , 'cwd8t75epzje')
        c(coder, 2**60     , '86hk4d8hj4yvcy')
        c(coder, 2**64 - 1 , 'sirnf2k2d2m3bm')
        c(coder, 2**70 - 1 , 'm77g4ezr3e8qay')
        c(coder, 2**70     , '43xf2jj6r6qm8bw4')
        c(coder, 2**80 - 1 , '6h3wb7wytjr5tbrd')
        c(coder, 2**80     , '4vumjq33d8iiwaharq')
        c(coder, 2**90 - 1 , 'qd7s3csnc5yfrrud5t')
        c(coder, 2**90     , 'jd3vsipfn69ia72chuvx')
        c(coder, 2**100 - 1, '628fg5kyid3z2vf2j4tf')

        coder = TESIDCoder('000102030405060708090a0b0c0d0e0f')
        c(coder, 0         , 'w2ej')
        c(coder, 2**20 - 1 , 'atcw')
        c(coder, 2**20     , '8qwm6y')
        c(coder, 2**30 - 1 , '3eipc7')
        c(coder, 2**30     , 'n3md95r4')
        c(coder, 2**40 - 1 , 'nnz4z5qb')
        c(coder, 2**40     , 'st9fvria97')
        c(coder, 2**50 - 1 , 'qt42fug7hq')
        c(coder, 2**50     , 'dykqxtu2ieqi')
        c(coder, 2**60 - 1 , 'h7rhnw6tfhun')
        c(coder, 2**60     , 'xb5c8isevin9i3')
        c(coder, 2**64 - 1 , 't62mijffzuvu4e')
        c(coder, 2**70 - 1 , 'n6n8jq6ike9dnj')
        c(coder, 2**70     , 'zk9d3setywjf7uwu')
        c(coder, 2**80 - 1 , 'bqqei5vmzkqjfru3')
        c(coder, 2**80     , 'z83vvq5u84sit9g7pd')
        c(coder, 2**90 - 1 , 'cpawgn8snjvverxvmp')
        c(coder, 2**90     , '397btwmkh5y7sjz2xu82')
        c(coder, 2**100 - 1, 'ia2bvpjaiju7g5uaxn5t')

        with self.assertRaises(ValueError): coder.encode(-1)
        with self.assertRaises(TypeError): coder.encode(0.5)
        with self.assertRaises(ValueError): coder.encode(2**100)

        # Test misencodings: 0 is w2ej, but if 0 were encrypted with n=15
        # instead of n=10, it’d be m2eig5—so that value isn’t allowed.
        assert_overlong = self.assertRaises(ValueError, msg='invalid TESID (overly long encoding)')
        with assert_overlong: coder.decode('m2eig5')

        # … but slightly changed values are probably valid (since only one in
        # 2¹⁰ is invalid).
        self.assertEqual(coder.decode('m2eig6'), 473063752)

        # Also a few more at the boundaries for confidence:
        # 2²⁰−1 but encoded with n=15 instead of n=10
        with assert_overlong: coder.decode('vf5fem')
        # 2³⁰−1 but encoded with n=20 instead of n=15
        with assert_overlong: coder.decode('ixs6h9ma')
        # 2³⁰−1 but encoded with n=50 instead of its natural n=15
        with assert_overlong: coder.decode('uhkprgrirp45pe54twsa')

        x = self.assertRaises(ValueError, msg='invalid TESID (wrong length)')
        with x: coder.decode('')
        with x: coder.decode('2')
        with x: coder.decode('22')
        with x: coder.decode('222')
        with x: coder.decode('22222')
        with x: coder.decode('2222222222222222222')
        with x: coder.decode('222222222222222222222')

        # … but just so it’s clear, ones are fine, it was just the lengths that were wrong.
        c(coder, 173734, '2222')
        c(coder, 592178178, '222222')
        c(coder, 111515659577240532774228475483, '22222222222222222222')

        # Now time for some tagging.
        def c2(coder: TESIDCoder, sparsity: int, discriminant: int, split, id: int, tesid: str):
            self.assertEqual(coder.encode(id, sparsity=sparsity, discriminant=discriminant), tesid)
            self.assertEqual(coder.decode(tesid, sparsity=sparsity, discriminant=discriminant), id)
            # `split` is self.assertEqual when split_decode can recover the
            # exact (id, discriminant) pair (i.e. discriminant < sparsity),
            # and self.assertNotEqual otherwise.
            split(coder.split_decode(tesid, sparsity),
                  SplitDecode(id=id, discriminant=discriminant))

        c2(coder, sparsity=1, discriminant=0, split=self.assertEqual, id=0, tesid='w2ej')
        c2(coder, sparsity=1, discriminant=1, split=self.assertNotEqual, id=0, tesid='w6um')
        c2(coder, sparsity=1, discriminant=2, split=self.assertNotEqual, id=0, tesid='x45g')

        c2(coder, sparsity=100, discriminant=0, split=self.assertEqual, id=0, tesid='w2ej')
        c2(coder, sparsity=100, discriminant=1, split=self.assertEqual, id=0, tesid='w6um')
        c2(coder, sparsity=100, discriminant=2, split=self.assertEqual, id=0, tesid='x45g')
        c2(coder, sparsity=100, discriminant=0, split=self.assertEqual, id=1, tesid='ypbn')
        c2(coder, sparsity=100, discriminant=1, split=self.assertEqual, id=1, tesid='k9pw')
        c2(coder, sparsity=100, discriminant=2, split=self.assertEqual, id=1, tesid='b7nc')
        c2(coder, sparsity=100, discriminant=0, split=self.assertEqual, id=2, tesid='r9yc')
        c2(coder, sparsity=100, discriminant=1, split=self.assertEqual, id=2, tesid='arf2')
        c2(coder, sparsity=100, discriminant=2, split=self.assertEqual, id=2, tesid='z6wh')
        c(coder, 0, 'w2ej')
        c(coder, 1, 'w6um')
        c(coder, 2, 'x45g')
        c(coder, 100, 'ypbn')
        c(coder, 101, 'k9pw')
        c(coder, 102, 'b7nc')
        c(coder, 200, 'r9yc')
        c(coder, 201, 'arf2')
        c(coder, 202, 'z6wh')
        # The highest sparsity that’s always valid given 64-bit discriminant and id (which is the case in the Rust implementation): 2³⁶ − 1
        c2(coder, sparsity=(1<<36) - 1, discriminant=2**64 - 1, split=self.assertNotEqual, id=2**64 - 1, tesid='fjwz5jk3p4gz9aqes22e')
        c(coder, 1267650600228229401427983728640, 'fjwz5jk3p4gz9aqes22e')
+Changelog +========= + +1.0.0 (2022-07-14) +------------------ + +Initial release. Implements TESID 1.0.0. diff --git a/rust/COPYRIGHT b/rust/COPYRIGHT new file mode 100644 index 00000000..2cced42 --- /dev/null +++ b/rust/COPYRIGHT @@ -0,0 +1,17 @@ +Copyright © 2022 Chris Morgan + +This project is distributed under the terms of three different licenses, +at your choice: + +- Blue Oak Model License 1.0.0: https://blueoakcouncil.org/license/1.0.0 +- MIT License: https://opensource.org/licenses/MIT +- Apache License, Version 2.0: https://www.apache.org/licenses/LICENSE-2.0 + +If you do not have particular cause to select the MIT or the Apache-2.0 +license, Chris Morgan recommends that you select BlueOak-1.0.0, which is +better and simpler than both MIT and Apache-2.0, which are only offered +due to their greater recognition and their conventional use in the Rust +ecosystem. (BlueOak-1.0.0 was only published in March 2019.) + +When using this code, ensure you comply with the terms of at least one of +these licenses. 
diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 00000000..b101902 --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "tesid" +description = "TESID: textualised encrypted sequential identifiers" +authors = ["Chris Morgan "] +license = "BlueOak-1.0.0 OR MIT OR Apache-2.0" +version = "1.0.0" +edition = "2021" +rust-version = "1.56" +keywords = ["id", "autoincrement"] +categories = ["no-std", "encoding", "value-formatting"] +homepage = "https://chrismorgan.info/tesid/" +repository = "https://git.chrismorgan.info/tesid" + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[features] +std = [] diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 00000000..bb7d8cf --- /dev/null +++ b/rust/README.md @@ -0,0 +1,73 @@ +# tesid: Rust crate for Textualised Encrypted Sequential Identifiers + +This crate implements the TESID scheme for producing short, human-friendly, +cryptographically-secure pseudorandom strings out of sequential identifiers. + +See for more information about TESID. + +A simple demonstration: + +```rust +let secret_key = "000102030405060708090a0b0c0d0e0f"; +let coder = tesid::TesidCoder::new(secret_key).unwrap(); +assert_eq!(&*coder.encode(0), "w2ej"); +assert_eq!(&*coder.encode(1), "w6um"); +assert_eq!(&*coder.encode(2), "x45g"); +assert_eq!(&*coder.encode(2_u64.pow(20) - 1), "atcw"); +assert_eq!(&*coder.encode(2_u64.pow(20)), "8qwm6y"); +assert_eq!(&*coder.encode(2_u64.pow(30) - 1), "3eipc7"); +assert_eq!(&*coder.encode(2_u64.pow(30)), "n3md95r4"); +assert_eq!(&*coder.encode_long(2_u128.pow(100) - 1).unwrap(), "ia2bvpjaiju7g5uaxn5t"); +assert_eq!(coder.encode_long(2_u128.pow(100)), Err(tesid::ValueTooLarge)); + +assert_eq!(coder.decode("w2ej"), Ok(0)); +``` + +And with the most convenient form of type discrimination: +(You can also do all this through TesidCoder, but it’s less convenient.) 
+ +```rust +use tesid::{ + TesidCoder, TypedTesidCoder, DecodeError, SplitDecode, + define_type_discriminant, +}; +define_type_discriminant!(sparsity=256, + #[non_exhaustive] + #[derive(Debug, PartialEq, Eq)] + pub enum Type { + A = 0, + B = 1, + C = 2, + } +); +let secret_key = "000102030405060708090a0b0c0d0e0f"; +let coder = TesidCoder::new(secret_key).unwrap(); +let typed_coder = TypedTesidCoder::::new(coder); + +assert_eq!(&*typed_coder.encode(Type::A, 0), "w2ej"); +assert_eq!(&*typed_coder.encode(Type::B, 0), "w6um"); +assert_eq!(&*typed_coder.encode(Type::C, 0), "x45g"); +assert_eq!(&*typed_coder.encode(Type::A, 1), "dh2h"); +assert_eq!(&*typed_coder.encode(Type::B, 1), "a6xy"); +assert_eq!(&*typed_coder.encode(Type::C, 1), "7xgj"); +assert_eq!(typed_coder.decode(Type::A, "w2ej"), Ok(0)); +assert_eq!(typed_coder.decode(Type::B, "w2ej"), + Err(DecodeError::WrongDiscriminant { id: 0, discriminant: Type::A })); +assert_eq!(typed_coder.split_decode("w2ej"), + Ok(SplitDecode { id: 0, discriminant: Type::A })); +``` + +## Cargo features + +- **std**: currently only `impl std::error::Error for tesid::{DecodeError, ValueTooLarge}`. + *Not* enabled by default. + +## Colophon + +**Authorship:** [Chris Morgan](https://chrismorgan.info/) is the author and maintainer of this library. + +**Licensing:** this library is distributed under the terms of the +[Blue Oak Model License 1.0.0](https://blueoakcouncil.org/license/1.0.0), the +[MIT License](https://opensource.org/licenses/MIT) and the +[Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0), at your choice. +See [COPYRIGHT](COPYRIGHT) for details. diff --git a/rust/examples/rejection_rates.rs b/rust/examples/rejection_rates.rs new file mode 100644 index 00000000..b65a5d2 --- /dev/null +++ b/rust/examples/rejection_rates.rs @@ -0,0 +1,98 @@ +//! Estimate (with the Monte Carlo method) at what rate IDs would be rejected, using various +//! bad-pattern rules. 
+ +fn main() { + let coder = tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + + let mut i64_total = 0; + let mut i64_passed = 0; + let mut i64_repeat2 = 0; + let mut i64_repeat3 = 0; + let mut i64_letters3 = 0; + let mut i64_letters4 = 0; + + println!("┌────────┬────────┬─────────┬─────────┬──────────┬──────────┐"); + println!("│ Length │ Pass │ repeat2 │ repeat3 │ 3letters │ 4letters │"); + println!("┝━━━━━━━━┿━━━━━━━━┿━━━━━━━━━┿━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┥"); + for j in 1..10 { + let range_low = if j == 1 { 0 } else { 1 << (j * 10) }; + let range_high = 1 << ((j + 1) * 10); + let sample_start = if j == 1 { + range_low + } else { + range_low // << 9 + // ↑ This is an opportunity to sample from a different region if you really want to, + // though because of the cipher, values are uniform so that you won’t see much change, + // less than 0.1 percentage points. You can go as high as `range_low << 9 - (2 << 20)`. + // (j == 1, the 4-character range, is excluded as we’re testing its entire range. 
+ }; + + let sample_end = sample_start + (1 << 20); + + let mut total = 0; + let mut passed = 0; + let mut repeat2 = 0; + let mut repeat3 = 0; + let mut letters3 = 0; + let mut letters4 = 0; + + let len = coder.encode_long(sample_start).unwrap().len(); + assert_eq!(coder.encode_long(sample_end - 1).unwrap().len(), len, + "you messed things up so that the sampling range isn’t entirely of one length"); + for i in sample_start..sample_end { + let s = coder.encode_long(i).unwrap(); + let s = s.as_bytes(); + let mut pass = true; + if s.windows(2).any(|c| c[0] == c[1]) { + repeat2 += 1; + pass = false; + } + if s.windows(3).any(|c| c[0] == c[1] && c[1] == c[2]) { + repeat3 += 1; + pass = false; + } + if s.windows(3).any(|digits| digits.iter().all(|c| c.is_ascii_alphabetic())) { + letters3 += 1; + pass = false; + } + if s.windows(4).any(|digits| digits.iter().all(|c| c.is_ascii_alphabetic())) { + letters4 += 1; + pass = false; + } + if pass { + passed += 1; + } + total += 1; + } + println!("│ {len:2} │ {:5.2}% │ {:5.2}% │ {:5.2}% │ {:5.2}% │ {:5.2}% │", + 100.0 * passed as f64 / total as f64, + 100.0 * repeat2 as f64 / total as f64, // the same character twice in a row + 100.0 * repeat3 as f64 / total as f64, // the same character thrice in a row + 100.0 * letters3 as f64 / total as f64, // three letters in a row + 100.0 * letters4 as f64 / total as f64, // four letters in a row + ); + + let i64s_in_range = if range_low > 1 << 63 { + 0 + } else { + range_high.min(1 << 63) - range_low + }; + i64_total += i64s_in_range; + i64_passed += (i64s_in_range as f64 * passed as f64 / total as f64) as u64; + i64_repeat2 += (i64s_in_range as f64 * repeat2 as f64 / total as f64) as u64; + i64_repeat3 += (i64s_in_range as f64 * repeat3 as f64 / total as f64) as u64; + i64_letters3 += (i64s_in_range as f64 * letters3 as f64 / total as f64) as u64; + i64_letters4 += (i64s_in_range as f64 * letters4 as f64 / total as f64) as u64; + } + 
println!("└────────┴────────┴─────────┴─────────┴──────────┴──────────┘"); + + println!("i64 (as commonly used by SQL databases) statistics, given sparsity=1, discriminant=0:"); + assert_eq!(i64_total, 1 << 63); // Sanity check 🙂 + println!(" • Total (=2⁶³) : {i64_total:19}"); + println!(" • Passed all : {i64_passed:19} ({:5.2}%)", 100.0 * i64_passed as f64 / i64_total as f64); + println!(" • Fail repeat2 : {i64_repeat2:19} ({:5.2}%)", 100.0 * i64_repeat2 as f64 / i64_total as f64); + println!(" • Fail repeat3 : {i64_repeat3:19} ({:5.2}%)", 100.0 * i64_repeat3 as f64 / i64_total as f64); + println!(" • Fail letters3 : {i64_letters3:19} ({:5.2}%)", 100.0 * i64_letters3 as f64 / i64_total as f64); + println!(" • Fail letters4 : {i64_letters4:19} ({:5.2}%)", 100.0 * i64_letters4 as f64 / i64_total as f64); + println!("(Most of the TESIDs fall in the 14-character range. With higher sparsity, you could get into the higher echelons of long IDs and new peaks of fail rates, but hopefully you can see that even if you reject 99% of IDs, there are still rather a lot left!)"); +} diff --git a/rust/src/base32.rs b/rust/src/base32.rs new file mode 100644 index 00000000..9057627 --- /dev/null +++ b/rust/src/base32.rs @@ -0,0 +1,169 @@ +//! Specialised small-value base-32 encoding and decoding. +//! +//! This implementation deals with values in the range [0, 2¹⁰⁰), which means up to 20 base-32 +//! characters (log₃₂ 2¹⁰⁰ = 20), because that’s the upper limit I’ve chosen for TESID. +//! If you want a general-purpose base-32 library for longer values, the base-x crate looks good. +//! But we get better performance by applying an upper bound consistent with the 5peck usage. + +// Digits and letters, minus 0, 1, l, o. +const ALPHABET: [u8; 32] = *b"23456789abcdefghijkmnpqrstuvwxyz"; + +// (I wish there was a real u7 so I could do Option instead of i8 with sentinel -1. +// NonZeroU8 is insufficient since I need zero. Kinda want NonMaxU8 instead.) 
+const ALPHABET_INVERSE: [i8; 256] = [ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, -1, 19, 20, -1, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +]; + +#[test] +fn check_base32_alphabet_tables_are_correct() { + for (i, &n) in ALPHABET_INVERSE.iter().enumerate() { + match n { + -128..=-2 | 32..=127 => unreachable!(), + -1 => (), + 0..=31 => assert_eq!(i as u8, ALPHABET[n as usize]), + } + } + for (i, &c) in ALPHABET.iter().enumerate() { + // SAFETY proof here, that ALPHABET members are all ASCII. (Though really, if you don’t + // trust simple observation of ALPHABET, I dunno why you’d trust this line! 😀 But I guess + // it’s in the test, which means something.) + assert!(c.is_ascii()); + assert_eq!(ALPHABET_INVERSE[c as usize] as usize, i); + } +} + +/// Base-32-encode a u128 that’s below 2¹⁰⁰. +/// The output is ASCII right-aligned with leading zero padding. +/// If the input is 2¹⁰⁰ or higher, bits will be lost. +pub const fn encode(input: u128) -> [u8; 20] { + // Observe the unrolled loop. 
A regular loop is, *for a given length*, somewhere between the + // same speed (small values) and several times slower (large values). In this library we only + // need 20-character output, so unrolled is mildly preferable. + [ + ALPHABET[((input >> 95) & 31) as usize], + ALPHABET[((input >> 90) & 31) as usize], + ALPHABET[((input >> 85) & 31) as usize], + ALPHABET[((input >> 80) & 31) as usize], + ALPHABET[((input >> 75) & 31) as usize], + ALPHABET[((input >> 70) & 31) as usize], + ALPHABET[((input >> 65) & 31) as usize], + ALPHABET[((input >> 60) & 31) as usize], + ALPHABET[((input >> 55) & 31) as usize], + ALPHABET[((input >> 50) & 31) as usize], + ALPHABET[((input >> 45) & 31) as usize], + ALPHABET[((input >> 40) & 31) as usize], + ALPHABET[((input >> 35) & 31) as usize], + ALPHABET[((input >> 30) & 31) as usize], + ALPHABET[((input >> 25) & 31) as usize], + ALPHABET[((input >> 20) & 31) as usize], + ALPHABET[((input >> 15) & 31) as usize], + ALPHABET[((input >> 10) & 31) as usize], + ALPHABET[((input >> 5) & 31) as usize], + ALPHABET[((input >> 0) & 31) as usize], + ] +} + +/// Base-32-decode a string. No sanity checking is performed on length, nor is overflow checked. +/// Strings below 26 characters long will succeed. +pub const fn decode(input: &str) -> Result { + // Unlike encode, decode is not well-adapted to loop-unrolling. + // My simple attempts both took three times as long. + let mut out = 0; + // No `for b in input.as_bytes()` at this time because const fn. + let input = input.as_bytes(); + let mut i = 0; + while i < input.len() { + out = (out << 5) | match ALPHABET_INVERSE[input[i] as usize] { + -128..=-1 => return Err(()), + n @ 0.. => n as u128, + }; + i += 1; + } + Ok(out) +} + +#[cfg(bench)] +#[bench] +fn bench_encode_short(b: &mut test::Bencher) { + // Encode all the numbers from zero to four characters wide (up to about a million). 
+ b.iter(|| for i in 0..(1 << 20) {
+ encode(test::black_box(i));
+ });
+}
+
+#[cfg(bench)]
+#[bench]
+fn bench_encode_long(b: &mut test::Bencher) {
+ // Encode around a million (same number as the short benchmarks) 20-character values.
+ // Should take much the same time as bench_encode_short.
+ b.iter(|| for i in (1 << 99)..((1 << 99) + (1 << 20)) {
+ encode(test::black_box(i));
+ });
+}
+
+#[cfg(bench)]
+#[bench]
+fn bench_encode_and_decode_short(b: &mut test::Bencher) {
+ b.iter(|| for i in 0..(1 << 20) {
+ decode(unsafe { core::str::from_utf8_unchecked(&encode(test::black_box(i))[16..]) }).unwrap();
+ });
+}
+
+#[cfg(bench)]
+#[bench]
+fn bench_encode_and_decode_long(b: &mut test::Bencher) {
+ b.iter(|| for i in (1 << 99)..((1 << 99) + (1 << 20)) {
+ decode(unsafe { core::str::from_utf8_unchecked(&encode(test::black_box(i))) }).unwrap();
+ });
+}
+
+// Round-trip and boundary tests for the specialised base-32 codec.
+#[test]
+fn test() {
+ const B32_345678: u128 = 1 * 32_u128.pow(5)
+ + 2 * 32_u128.pow(4)
+ + 3 * 32_u128.pow(3)
+ + 4 * 32_u128.pow(2)
+ + 5 * 32_u128.pow(1)
+ + 6 * 32_u128.pow(0);
+
+ #[allow(non_upper_case_globals)]
+ const B32_base32: u128 = 9 * 32_u128.pow(5)
+ + 8 * 32_u128.pow(4)
+ + 24 * 32_u128.pow(3)
+ + 12 * 32_u128.pow(2)
+ + 1 * 32_u128.pow(1)
+ + 0 * 32_u128.pow(0);
+
+ assert_eq!(encode(0), *b"22222222222222222222");
+ assert_eq!(decode(""), Ok(0));
+ assert_eq!(decode("2"), Ok(0));
+ assert_eq!(encode(1), *b"22222222222222222223");
+ assert_eq!(decode("3"), Ok(1));
+ assert_eq!(encode(32), *b"22222222222222222232");
+ assert_eq!(decode("32"), Ok(32));
+ assert_eq!(encode(B32_345678), *b"22222222222222345678");
+ assert_eq!(decode("345678"), Ok(B32_345678));
+ assert_eq!(encode(B32_base32), *b"22222222222222base32");
+ assert_eq!(decode("base32"), Ok(B32_base32));
+ assert_eq!(encode(u128::MAX), *b"zzzzzzzzzzzzzzzzzzzz"); // truncated at 100 bits (20 chars × 5 bits; encode only reads bits 0–99)
+
+ // Decoding overly long values leads to truncation at 2¹²⁸.
+ assert_eq!(decode("zyxwvutsrqpnmkjihgfedcba98765432"), Ok(75421586008491043410716744415157782560));
+ // Encoding overly long values leads to truncation at 2¹⁰⁰.
+ assert_eq!(encode(75421586008491043410716744415157782560), *b"mkjihgfedcba98765432");
+}
diff --git a/rust/src/fpeck.rs
new file mode 100644
index 00000000..0cbab5d
--- /dev/null
+++ b/rust/src/fpeck.rs
@@ -0,0 +1,122 @@
+//! 5peck: instantiations of the Speck family of ciphers with weird word sizes and key handling.
+//!
+//! These are NOT suitable for general cryptography. They are designed for this particular
+//! application and nothing else. The smaller block sizes especially would certainly be unsuitable
+//! for their vulnerability to birthday attacks when used in typical cryptographic modes.
+//! But those vulnerabilities don’t apply to TESID which doesn’t use such modes.
+//!
+//! These are NOT dependable. Speck’s parameters were chosen by cryptographers based on deep
+//! understanding, real cryptanalysis, and simulation. Meanwhile, I wrote 5peck, non-cryptographer
+//! that I am and not *particularly* knowledgeable in the field (especially about cryptanalysis).
+//! I have taken guidance from their [original paper](https://eprint.iacr.org/2013/404.pdf) and
+//! [subsequent design notes](https://eprint.iacr.org/2017/560.pdf), but I’ve had to deviate quite
+//! significantly from their design in some elements, most significantly in the treatment of the
+//! key for small word sizes. The parameters I’ve settled on are largely based on interpolation
+//! and a little extrapolation, with only some very basic cryptanalytic simulations; but I believe
+//! what I have should hold up well with the best of ’em.
+//!
+//! The main two alternatives that I could think of were: Hasty Pudding, which with its bit-level
+//! blocksize flexibility would work just fine but is *considerably* more complex, slower, and more
+//!
shallowly studied; and something like NIST’s FF1 or FF3-1 ciphers, which are patent-encumbered
+//! which makes me stay right away from them. There are surprisingly few practical options for this
+//! sort of thing that I’m doing, really.
+
+/// An all-ones mask covering the low `word_bits` bits: 2^word_bits − 1.
+// NOTE(review): `2_u64.pow(64)` would overflow, so this assumes word_bits < 64 —
+// fine for 5peck’s small word sizes, but worth stating.
+const fn mask(word_bits: u32) -> u64 {
+ 2_u64.pow(word_bits) - 1
+}
+
+/// Rotate a *bits*-bit integer left, WITHOUT masking or checking rhs < bits.
+/// (That is, the input must be masked and masking the output is your responsibility.)
+const fn rotate_left(lhs: u64, rhs: u32, bits: u32) -> u64 {
+ (lhs << rhs) | (lhs >> (bits - rhs))
+}
+
+/// Rotate a *bits*-bit integer right, WITHOUT masking or checking rhs < bits.
+/// (That is, the input must be masked and masking the output is your responsibility.)
+const fn rotate_right(lhs: u64, rhs: u32, bits: u32) -> u64 {
+ (lhs << (bits - rhs)) | (lhs >> rhs)
+}
+
+/// Expand the 128-bit key into the 30 round keys, Speck-schedule style with two
+/// 64-bit key words (m=2). Note this uses full-width 64-bit rotations, unlike the
+/// word-sized rotations applied per round in encrypt/decrypt below.
+pub fn expand(key: u128) -> [u64; 30] {
+ let mut k = [0; 30];
+ let mut y = key as u64;
+ k[0] = y;
+ let mut x = (key >> 64) as u64; // m=2 means ℓ can be a single value rather than an array
+ for i in 0..29 {
+ x = x.rotate_right(9).wrapping_add(y) ^ (i as u64);
+ y = y.rotate_left(2) ^ x;
+ k[i + 1] = y;
+ }
+ k
+}
+
+// On endianness: I’ve followed Speck and gone little-endian. As I’m only encrypting numbers, I
+// take a u128 rather than messing around with [u8; 16] or such. The least significant bits are
+// taken as x (the first word), the most significant as y (the second word).
+ +pub fn encrypt(k: &[u64; 30], n: u32, plain_text: u128) -> u128 { + debug_assert_eq!(plain_text >> (2 * n), 0); + let mask = mask(n); + let mut x = (plain_text as u64) & mask; + let mut y = ((plain_text >> n) as u64) & mask; + + for &k in k { + x = (rotate_right(x, 9, n).wrapping_add(y) ^ k) & mask; + y = (rotate_left(y, 2, n) ^ x) & mask; + } + + ((y as u128) << n) | (x as u128) +} + +pub fn decrypt(k: &[u64; 30], n: u32, cipher_text: u128) -> u128 { + debug_assert_eq!(cipher_text >> (2 * n), 0); + let mask = mask(n); + let mut x = (cipher_text as u64) & mask; + let mut y = ((cipher_text >> n) as u64) & mask; + + for &k in k.iter().rev() { + y = rotate_right(x ^ y, 2, n) & mask; + x = rotate_left((x ^ k).wrapping_sub(y) & mask, 9, n) & mask; + } + + ((y as u128) << n) | (x as u128) +} + +#[test] +fn test_5peck_n_10() { + let k = expand(0x000102030405060708090a0b0c0d0e0f); + assert_eq!(encrypt(&k, 10, 0), 917905); + assert_eq!(decrypt(&k, 10, 917905), 0); + assert_eq!(encrypt(&k, 10, 0b1111111111), 314126); + assert_eq!(decrypt(&k, 10, 314126), 0b1111111111); + // Look, anything else can be covered by the TESID tests. + // They cover at least two values from each n. +} + +#[cfg(feature = "std")] +#[test] +#[cfg_attr(debug_assertions, ignore)] // Takes a second or two in debug mode (<1ms in release). +fn show_5peck_n_10_is_a_permutation() { + let k = expand(0x000102030405060708090a0b0c0d0e0f); + let mut seen = std::collections::HashSet::new(); + for i in 0..0b100000000000000000000 { + let encrypted = encrypt(&k, 10, i); + assert!(encrypted < 0b100000000000000000000); + assert!(seen.insert(encrypted)); + assert_eq!(decrypt(&k, 10, encrypted), i); + } +} + +#[cfg(bench)] +#[bench] +fn bench_5peck_n_10_all(b: &mut test::Bencher) { + let k = expand(0x000102030405060708090a0b0c0d0e0f); + // Yeah, let’s just encrypt all 2²⁰ values (just over one million); why not? 
+ // Takes a little under one millisecond on my laptop—so each encryption is taking a bit under + // one nanosecond. Of course, if we did other stuff in between, then accessing the expanded key + // might slow things down a bit. + b.iter(|| for i in 0..0b100000000000000000000 { + // Gotta black-box it or the entire thing takes 0ns! 🙂 + encrypt(&k, 10, test::black_box(i)); + }) +} diff --git a/rust/src/inline_string.rs b/rust/src/inline_string.rs new file mode 100644 index 00000000..b0d3ebf --- /dev/null +++ b/rust/src/inline_string.rs @@ -0,0 +1,140 @@ +use core::borrow::Borrow; +use core::cmp::Ordering; +use core::fmt; +use core::hash::{Hash, Hasher}; +use core::ops::Deref; +use core::str; + +/// A string of runtime-determined length that stores its contents inline. +/// +/// This type exists for the sake of performance and avoiding allocation (including no-std +/// operation). It dereferences to [`prim@str`] and also implements most of the useful traits that +/// `str` and/or `String` implement, so you can mostly just treat it as any other string. If you +/// want to explicitly convert it to `&str`, there’s `.as_str()`, and to `String`, `.to_string()` +/// or any of the other similar techniques. +/// +/// From an implementation perspective, this is designed for efficient base-conversion: the biggest +/// consequence of this is that the string is *right*-aligned within its capacity, rather than the +/// usual left. It thus stores a start offset instead of a length. +#[derive(Clone)] +pub struct InlineString { + /// SAFETY requirement: buf[start..CAPACITY] must be valid UTF-8. + pub(crate) buf: [u8; CAPACITY], + /// SAFETY requirement: start <= CAPACITY. + pub(crate) start: usize, +} + +impl InlineString { + /// Access this as an `&str`. + // (Sorry, no const as const_slice_index isn’t yet stable.) + pub fn as_str(&self) -> &str { + // SAFETY: both unchecked calls here correspond to declared invariants. 
+ unsafe { str::from_utf8_unchecked(self.buf.get_unchecked(self.start..)) } + } +} + +impl Deref for InlineString { + type Target = str; + fn deref(&self) -> &str { + self.as_str() + } +} + +impl fmt::Display for InlineString { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.as_str().fmt(f) + } +} + +impl fmt::Debug for InlineString { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.as_str().fmt(f) + } +} + +impl PartialEq> for InlineString { + fn eq(&self, rhs: &InlineString) -> bool { + self.as_str() == rhs.as_str() + } +} + +impl PartialEq for InlineString { + fn eq(&self, rhs: &str) -> bool { + self.as_str() == rhs + } +} + +impl PartialEq<&str> for InlineString { + fn eq(&self, rhs: &&str) -> bool { + self.as_str() == *rhs + } +} + +impl Eq for InlineString {} + +impl AsRef for InlineString { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl Borrow for InlineString { + fn borrow(&self) -> &str { + self.as_str() + } +} + +impl Hash for InlineString { + fn hash(&self, state: &mut H) { + self.as_str().hash(state) + } +} + +impl PartialOrd for InlineString { + fn partial_cmp(&self, rhs: &InlineString) -> Option { + self.as_str().partial_cmp(rhs.as_str()) + } +} + +impl Ord for InlineString { + fn cmp(&self, rhs: &InlineString) -> Ordering { + self.as_str().cmp(rhs.as_str()) + } +} + +#[test] +fn test_implementations() { + // SAFETY: all trivially meet the requirements about buf and start. 
+ let s1 = InlineString { buf: *b"He\xE2\x84\x93lo!\n", start: 0 }; + let s2 = InlineString { buf: *b"unusedHe\xE2\x84\x93lo!\n", start: 6 }; + let s3 = InlineString { buf: *b"He\xE2\x84\x93lo?\n", start: 0 }; + // PartialEq, Eq: + assert_eq!(s1, "Heℓlo!\n"); + assert_eq!(s1, s1); + assert_eq!(s1, s2); + assert_ne!(s1, s3); + assert_eq!(s2, "Heℓlo!\n"); + assert_eq!(s2, s1); + assert_eq!(s2, s2); + assert_ne!(s2, s3); + assert_eq!(s3, "Heℓlo?\n"); + assert_ne!(s3, "Heℓlo!\n"); + assert_ne!(s3, s1); + assert_ne!(s3, s2); + assert_eq!(s3, s3); + // Eh, you know what? I’m bored, can’t be bothered writing tests for AsRef, Borrow, Hash, + // PartialOrd, Ord. But they’re all similarly trivial, and PartialEq was really the most + // interesting, with its multiple implementations. +} + +#[test] +#[cfg(feature = "std")] // format! is std. As this is the only test yet that needs std⸺ +fn test_fmt_implementations() { + // SAFETY: all trivially meet the requirements about buf and start. + let s1 = InlineString { buf: *b"He\xE2\x84\x93lo!\n", start: 0 }; + let s2 = InlineString { buf: *b"unusedHe\xE2\x84\x93lo!\n", start: 6 }; + assert_eq!(format!("{s1}"), "Heℓlo!\n"); + assert_eq!(format!("{s1:?}"), "\"Heℓlo!\\n\""); + assert_eq!(format!("{s2}"), "Heℓlo!\n"); + assert_eq!(format!("{s2:?}"), "\"Heℓlo!\\n\""); +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 00000000..e2e3caf --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,852 @@ +//! TESID: Textualised Encrypted Sequential Identifiers +//! +//! See for a description of what it’s all about, and how to best +//! use it. +//! +//! # Example +//! +//! ```rust +//! let secret_key = "000102030405060708090a0b0c0d0e0f"; +//! let coder = tesid::TesidCoder::new(secret_key).unwrap(); +//! assert_eq!(&*coder.encode(0), "w2ej"); +//! assert_eq!(&*coder.encode(1), "w6um"); +//! assert_eq!(&*coder.encode(2), "x45g"); +//! assert_eq!(&*coder.encode(2_u64.pow(20) - 1), "atcw"); +//! 
assert_eq!(&*coder.encode(2_u64.pow(20)), "8qwm6y"); +//! assert_eq!(&*coder.encode(2_u64.pow(30) - 1), "3eipc7"); +//! assert_eq!(&*coder.encode(2_u64.pow(30)), "n3md95r4"); +//! assert_eq!(&*coder.encode_long(2_u128.pow(100) - 1).unwrap(), "ia2bvpjaiju7g5uaxn5t"); +//! assert_eq!(coder.encode_long(2_u128.pow(100)), Err(tesid::ValueTooLarge)); +//! +//! assert_eq!(coder.decode("w2ej"), Ok(0)); +//! ``` + +#![cfg_attr(not(feature = "std"), no_std)] +#![cfg_attr(docsrs, feature(doc_cfg))] + +#![allow(clippy::eq_op, clippy::identity_op)] // `20 - 20` and `input >> 0`, good for symmetry. + +// To run benchmarks: `RUSTFLAGS="--cfg bench" cargo +nightly bench` +#![cfg_attr(bench, feature(test))] +#[cfg(bench)] extern crate test; + +use core::fmt; +use core::marker::PhantomData; +use core::ptr; +use core::sync::atomic; + +mod base32; +mod fpeck; +mod inline_string; + +pub use inline_string::InlineString; + +/// The error type when decoding of a TESID failed. +/// +/// Decoding can fail for a number of reasons; most are things you can’t do anything about. +/// `WrongDiscriminant` is the only one that’s likely to be genuinely useful, +/// for enhancing error messages. +/// +/// Because of the typical opaqueness of TESID decode errors, +/// The `fmt::Display` implementation doesn’t try to say anything useful, just “invalid TESID”. +/// +#[cfg_attr(feature = "std", doc = "Since **crate feature `std`** is enabled, this type implements [`std::error::Error`]. +(Though if you’re using `TypedTesidCoder`, you will need to make sure that your type discriminant type implements [`core::fmt::Debug`].)")] +#[cfg_attr(not(feature = "std"), doc = "If you want this type to implement `std::error::Error`, enable **crate feature `std`**.")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum DecodeError { + /// The TESID was not 4, 6, 8, 10, 12, 14, 16, 18 or 20 characters long. + BadLength, + + /// The TESID contained characters not in the defined base-32 alphabet. 
+ BadCharacters,
+
+ /// The TESID was an improperly-encoded value, using the wrong length and cipher,
+ /// e.g. a ten-character string that decodes to 0, but 0 should be four characters.
+ /// This cannot happen normally and suggests the user is trying to enumerate IDs.
+ OverlyLongEncoding,
+
+ /// The value was not in the acceptable range for the type,
+ /// e.g. ≥2⁶⁴ in a u64, or <0 via discriminant.
+ ///
+ /// Note that id must be u64 where sparsity or discrimination are used.
+ OutOfRange,
+
+ /// The TESID did not match the discriminant requested.
+ /// This assumes that the sparsity was correct.
+ /// The actual discriminant and ID found are provided.
+ ///
+ /// This variant is basically the just-for-enhancing-error-reporting version of
+ /// [`TesidCoder::split_decode`] or [`TypedTesidCoder::split_decode`],
+ /// which you should use instead if you want to actively support diverse discriminants.
+ WrongDiscriminant {
+ id: u64,
+ discriminant: Discriminant,
+ },
+
+ /// Via [`TypedTesidCoder`] only: the TESID’s discriminant wasn’t an acceptable value.
+ MalformedDiscriminant {
+ id: u64,
+ discriminant: u64,
+ },
+}
+
+impl fmt::Display for DecodeError {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.write_str("invalid TESID")
+ }
+}
+
+#[cfg(feature = "std")]
+#[cfg_attr(docsrs, doc(cfg(feature = "std")))]
+impl std::error::Error for DecodeError {}
+
+/// The error type when encoding of a TESID failed.
+///
+/// Encoding can fail when `encode_long` is used with a value of 2¹⁰⁰ or more.
+/// +#[cfg_attr(feature = "std", doc = "Since **crate feature `std`** is enabled, this type implements [`std::error::Error`].")] +#[cfg_attr(not(feature = "std"), doc = "If you want this type to implement `std::error::Error`, enable **crate feature `std`**.")] +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ValueTooLarge; + +impl fmt::Display for ValueTooLarge { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str("cannot TESID-encode a value 2¹⁰⁰ or above") + } +} + +#[cfg(feature = "std")] +#[cfg_attr(docsrs, doc(cfg(feature = "std")))] +impl std::error::Error for ValueTooLarge {} + +/// The TESID coder. +#[derive(Clone)] +pub struct TesidCoder { + expanded_key: [u64; 30], +} + +impl Drop for TesidCoder { + fn drop(&mut self) { + // Might as well zero the memory in which lies the expanded key, as good security practice. + // SAFETY: the pointer is trivially valid and aligned, and not accessed concurrently. + unsafe { ptr::write_volatile(&mut self.expanded_key, Default::default()); } + atomic::compiler_fence(atomic::Ordering::SeqCst); + } +} + +impl TesidCoder { + /// Initialise a TESID coder with a key in hexadecimal string representation. + /// + /// The key string must be made up of exactly 32 lowercase hexadecimal (0-9a-f) characters, + /// and should have been generated randomly. + /// Refer to external documentation for advice on key generation. + /// + /// ``` + /// tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + /// ``` + pub fn new(key: &str) -> Option { + // from_str_radix plus checks to remove its undesired functionality (leading +, len != 16, A-F) + // won’t be quite as fast as possible, but the difference should be slight enough. + if key.len() != 32 || key.bytes().any(|b| !matches!(b, b'0'..=b'9' | b'a'..=b'f')) { + return None; + } + let key = u128::from_str_radix(key, 16).ok()?; + Some(TesidCoder { + expanded_key: fpeck::expand(key), + }) + } + + /// Encode an ID. 
+ /// + /// ```rust + /// let coder = tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + /// assert_eq!(&*coder.encode(0), "w2ej"); + /// ``` + pub fn encode(&self, id: u64) -> InlineString<20> { + self.encode_long(id as u128).unwrap() + } + + /// Encode an ID, with sparsity and/or discrimination. + /// + /// This will panic if `id * sparsity + discriminant` is 2¹⁰⁰ or higher, which can only happen + /// if `sparsity` is 2³⁶ or higher (which would be sparsity of one in 68 billion, adding around + /// 7.2 characters to the TESID, extreme overkill for all reasonable scenarios—and that’s why + /// this method panics rather than returning a Result, because panicking just about guarantees + /// that you’re doing the wrong thing; yet I haven’t artificially capped `sparsity`, as you + /// *can* correctly use it with a higher value, so long as `id` doesn’t get too high). + // (Well, actually I kinda have artificially limited all three parameters to u64.) + /// + /// ```rust + /// // Define constants for consistent use: + /// const TYPE_SPARSITY: u64 = 256; // meaning up to 256 possible types + /// const TYPE_A: u64 = 0; + /// const TYPE_B: u64 = 1; + /// const TYPE_C: u64 = 2; + /// + /// let coder = tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + /// + /// // id 0 with three different discriminants/tags: + /// assert_eq!(&*coder.encode_with_tag(0, TYPE_SPARSITY, TYPE_A), "w2ej"); + /// assert_eq!(&*coder.encode_with_tag(0, TYPE_SPARSITY, TYPE_B), "w6um"); + /// assert_eq!(&*coder.encode_with_tag(0, TYPE_SPARSITY, TYPE_C), "x45g"); + /// + /// // id 1 with three different discriminants/tags: + /// assert_eq!(&*coder.encode_with_tag(1, TYPE_SPARSITY, TYPE_A), "dh2h"); + /// assert_eq!(&*coder.encode_with_tag(1, TYPE_SPARSITY, TYPE_B), "a6xy"); + /// assert_eq!(&*coder.encode_with_tag(1, TYPE_SPARSITY, TYPE_C), "7xgj"); + /// ``` + pub fn encode_with_tag(&self, id: u64, sparsity: u64, discriminant: u64) -> InlineString<20> { + let 
id = id as u128; + id.checked_mul(sparsity as u128) + .and_then(|id| id.checked_add(discriminant as u128)) + .and_then(|id| self.encode_long(id).ok()) + .expect("TesidCoder::encode_with_tag used badly") + } + + /// Attempt to encode a u128 ID. This will return an error if the ID is 2¹⁰⁰ or greater. + /// + /// ```rust + /// let coder = tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + /// assert_eq!(&*coder.encode_long(1<<70).unwrap(), "zk9d3setywjf7uwu"); + /// assert_eq!(coder.encode_long(1<<100), Err(tesid::ValueTooLarge)); + /// ``` + pub fn encode_long(&self, id: u128) -> Result, ValueTooLarge> { + // Maybe in the future I’ll be able to do an inline const pattern, but for now— + const MIN_10: u128 = 0 ; const MAX_10: u128 = (1 << 20) - 1; + const MIN_15: u128 = 1 << 20; const MAX_15: u128 = (1 << 30) - 1; + const MIN_20: u128 = 1 << 30; const MAX_20: u128 = (1 << 40) - 1; + const MIN_25: u128 = 1 << 40; const MAX_25: u128 = (1 << 50) - 1; + const MIN_30: u128 = 1 << 50; const MAX_30: u128 = (1 << 60) - 1; + const MIN_35: u128 = 1 << 60; const MAX_35: u128 = (1 << 70) - 1; + const MIN_40: u128 = 1 << 70; const MAX_40: u128 = (1 << 80) - 1; + const MIN_45: u128 = 1 << 80; const MAX_45: u128 = (1 << 90) - 1; + const MIN_50: u128 = 1 << 90; const MAX_50: u128 = (1 << 100) - 1; + const MIN_TOO_LARGE: u128 = 1 << 100; + let (n, start) = match id { + MIN_10..=MAX_10 => (10, 20 - 4), + MIN_15..=MAX_15 => (15, 20 - 6), + MIN_20..=MAX_20 => (20, 20 - 8), + MIN_25..=MAX_25 => (25, 20 - 10), + MIN_30..=MAX_30 => (30, 20 - 12), + MIN_35..=MAX_35 => (35, 20 - 14), + MIN_40..=MAX_40 => (40, 20 - 16), + MIN_45..=MAX_45 => (45, 20 - 18), + MIN_50..=MAX_50 => (50, 20 - 20), + MIN_TOO_LARGE.. => return Err(ValueTooLarge), + }; + + // SAFETY: base32::encode guarantees its output is ASCII; and start is less than buf.len(). + // (start lets us take the padding we want and discard the rest.) 
+ Ok(InlineString { + buf: base32::encode(fpeck::encrypt(&self.expanded_key, n, id)), + start, + }) + } + + /// Decode a 64-bit ID. + /// + /// This is just `decode_long` followed by `u64::try_from`, but I wanted this for symmetry with + /// `encode`/`encode_long` which are separated because of the fallibility of the latter. By the + /// same token, I suppose if I just provided a u128-yielding method, then it’d be too tempting + /// for users to use the faulty `as u64`. Never mind that you might want an `i64` anyway (since + /// SQL databases don’t do unsigned integers very well) and thus will have to do the same + /// thing. Ah well, you can’t win them all. I got my symmetry. + // I thought briefly about a generic API so that you can .decode::(…), + // but decided that would be too often too painful to use. .decode_u128(…) / .decode_u64(…) / + // .decode_i64(…) would probably be better if I were going that way. + /// + /// ```rust + /// let coder = tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + /// assert_eq!(coder.decode("w2ej"), Ok(0)); + /// ``` + pub fn decode(&self, tesid: &str) -> Result> { + self.decode_long(tesid).and_then(|id| id.try_into().map_err(|_| DecodeError::OutOfRange)) + } + + /// Decode a 128-bit ID. 
+ /// + /// ```rust + /// let coder = tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + /// assert_eq!(coder.decode_long("zk9d3setywjf7uwu"), Ok(1<<70)); + /// ``` + pub fn decode_long(&self, tesid: &str) -> Result> { + let (n, minimum) = match tesid.len() { + 4 => (10, 0), + 6 => (15, 1 << 20), + 8 => (20, 1 << 30), + 10 => (25, 1 << 40), + 12 => (30, 1 << 50), + 14 => (35, 1 << 60), + 16 => (40, 1 << 70), + 18 => (45, 1 << 80), + 20 => (50, 1 << 90), + _ => return Err(DecodeError::BadLength), + }; + let number = base32::decode(tesid).map_err(|()| DecodeError::BadCharacters)?; + let id = fpeck::decrypt(&self.expanded_key, n, number); + if id >= minimum { + Ok(id) + } else { + Err(DecodeError::OverlyLongEncoding) + } + } + + /// Decode an ID that was encoded with sparsity and/or discrimination. + /// + /// See [`TesidCoder::encode_with_tag`] for a description of `sparsity` and `discriminant`. + /// + /// If you want to decode supporting any discriminant, see [`TesidCoder::split_decode`]. 
+ ///
+ /// ```rust
+ /// // Define constants for consistent use:
+ /// const TYPE_SPARSITY: u64 = 256; // meaning up to 256 possible types
+ /// const TYPE_A: u64 = 0;
+ /// const TYPE_B: u64 = 1;
+ /// const TYPE_C: u64 = 2;
+ ///
+ /// let coder = tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap();
+ ///
+ /// // id 0 with three different discriminants/tags:
+ /// assert_eq!(coder.decode_with_tag("w2ej", TYPE_SPARSITY, TYPE_A), Ok(0));
+ /// assert_eq!(coder.decode_with_tag("w6um", TYPE_SPARSITY, TYPE_B), Ok(0));
+ /// assert_eq!(coder.decode_with_tag("x45g", TYPE_SPARSITY, TYPE_C), Ok(0));
+ ///
+ /// // id 1 with three different discriminants/tags:
+ /// assert_eq!(coder.decode_with_tag("dh2h", TYPE_SPARSITY, TYPE_A), Ok(1));
+ /// assert_eq!(coder.decode_with_tag("a6xy", TYPE_SPARSITY, TYPE_B), Ok(1));
+ /// assert_eq!(coder.decode_with_tag("7xgj", TYPE_SPARSITY, TYPE_C), Ok(1));
+ ///
+ /// // And you can’t decode an ID with the wrong discriminant (presuming correct sparsity):
+ /// assert_eq!(coder.decode_with_tag("w2ej", TYPE_SPARSITY, TYPE_C),
+ /// Err(tesid::DecodeError::WrongDiscriminant { id: 0, discriminant: 0 }));
+ /// ```
+ pub fn decode_with_tag(&self, tesid: &str, sparsity: u64, discriminant: u64)
+ -> Result> {
+ // This deliberately DOESN’T use split_decode, which requires sparsity > discriminant,
+ // which I don’t want to be required here.
+ let id = self.decode_long(tesid)?;
+ if let Some(id) = id.checked_sub(discriminant as u128) {
+ if id % sparsity as u128 == 0 {
+ return u64::try_from(id / sparsity as u128)
+ .map_err(|_| DecodeError::OutOfRange);
+ }
+ }
+ if sparsity <= discriminant {
+ // This suggests discriminant is just being used for an offset,
+ // not as a type tag or similar. In this situation, the discriminant is irreparable,
+ // so we just declare it bad and wash our hands of it.
+ Err(DecodeError::OutOfRange)
+ } else {
+ Err(DecodeError::WrongDiscriminant {
+ id: u64::try_from(id / sparsity as u128).map_err(|_| DecodeError::OutOfRange)?,
+ // NOTE(review): `as` binds tighter than `%`, so this is `(id as u64) % sparsity`:
+ // the u128 id (up to 2¹⁰⁰) is truncated to 64 bits *before* the modulo. For
+ // id ≥ 2⁶⁴ with a sparsity that doesn’t divide 2⁶⁴, the reported discriminant
+ // can differ from `id % sparsity` (contrast the success path above, which
+ // computes `id % sparsity as u128` on the full value). Presumably harmless for
+ // the intended small power-of-two sparsities — TODO confirm, or use
+ // `(id % sparsity as u128) as u64`.
+ discriminant: id as u64 % sparsity,
+ })
+ }
+ }
+
+ /// Decode an ID that was encoded with certain sparsity,
+ /// separating the discriminant and returning it alongside the ID.
+ ///
+ /// This is useful if you want to accept various discriminants;
+ /// one simple use case is better error reporting:
+ /// “that’s an ID for type A, but this takes IDs for type B”.
+ ///
+ /// This allows *you* to identify the discriminant, but due to the encryption,
+ /// anyone who has only the ID cannot;
+ /// if you want users to be able to discern the discriminant,
+ /// consider adding a human-friendly prefix to the ID;
+ /// I like a single uppercase letter or a word followed by an underscore.
+ ///
+ /// This requires that the discriminant be less than the sparsity,
+ /// or incorrect values will be produced.
+ // Note the use of u64 here. This also fits into the symmetry I was talking of. If you happen
+ // to want split_decode for IDs 2⁶⁴ or higher, well, here’s the implementation, it’s simple.
+ ///
+ /// ```rust
+ /// // Define constants for consistent use:
+ /// const TYPE_SPARSITY: u64 = 256; // meaning up to 256 possible types
+ /// const TYPE_A: u64 = 0;
+ /// const TYPE_B: u64 = 1;
+ /// const TYPE_C: u64 = 2;
+ ///
+ /// let coder = tesid::TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap();
+ ///
+ /// // id 0 with three different discriminants/tags:
+ /// assert_eq!(coder.split_decode("w2ej", TYPE_SPARSITY),
+ /// Ok(tesid::SplitDecode { id: 0, discriminant: TYPE_A }));
+ /// assert_eq!(coder.split_decode("w6um", TYPE_SPARSITY),
+ /// Ok(tesid::SplitDecode { id: 0, discriminant: TYPE_B }));
+ /// assert_eq!(coder.split_decode("x45g", TYPE_SPARSITY),
+ /// Ok(tesid::SplitDecode { id: 0, discriminant: TYPE_C }));
+ ///
+ /// // id 1 with three different discriminants/tags:
+ /// assert_eq!(coder.split_decode("dh2h", TYPE_SPARSITY),
+ /// Ok(tesid::SplitDecode { id: 1, discriminant: TYPE_A }));
+ /// assert_eq!(coder.split_decode("a6xy", TYPE_SPARSITY),
+ /// Ok(tesid::SplitDecode { id: 1, discriminant: TYPE_B }));
+ /// assert_eq!(coder.split_decode("7xgj", TYPE_SPARSITY),
+ /// Ok(tesid::SplitDecode { id: 1, discriminant: TYPE_C }));
+ /// ```
+ pub fn split_decode(&self, tesid: &str, sparsity: u64)
+ -> Result, DecodeError> {
+ let id = self.decode_long(tesid)?;
+ Ok(SplitDecode {
+ id: u64::try_from(id / sparsity as u128)
+ .map_err(|_| DecodeError::OutOfRange)?,
+ // NOTE(review): this is `(id as u64) % sparsity` (`as` binds tighter than `%`),
+ // truncating the u128 id to 64 bits before the modulo; for id ≥ 2⁶⁴ and a
+ // sparsity not dividing 2⁶⁴ this yields a different discriminant than
+ // `id % sparsity`. TODO confirm intended, or use `(id % sparsity as u128) as u64`.
+ discriminant: id as u64 % sparsity,
+ })
+ }
+}
+
+impl fmt::Debug for TesidCoder {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ // Hide the expanded key.
+ f.debug_struct("TesidCoder").finish_non_exhaustive()
+ }
+}
+
+/// The output of [`TesidCoder::split_decode`], separating ID and discriminant.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct SplitDecode {
+ /// The decoded ID, with sparsity and discriminant stripped.
+ pub id: u64,
+ /// The discriminant found in the decoded value.
+ pub discriminant: Discriminant,
+}
+
+/// A trait to implement for type enums for their use with [`TypedTesidCoder`].
+/// +/// See [`define_type_discriminant!`] for the usual way of defining types that implement this. +pub trait TypeDiscriminant: Sized { + /// The value to use for sparsity. This must exceed the highest discriminant, + /// or discriminants equal or greater will decode incorrectly. + const SPARSITY: u64; + fn into_discriminant(self) -> u64; + fn from_discriminant(discriminant: u64) -> Option; +} + +// It would have been nice to impl TypeDiscriminant for u64, +// but I would have had to shift sparsity to a runtime parameter, +// and I don’t want to lose its place as a const on the impl. + +/// Define a type enum for use with `TypedTesidCoder`. +/// +/// This is just shorthand for defining the C-style enum written, +/// and implementing [`TypeDiscriminant`] in the obvious way on it. +/// If it’s not to your liking, you can easily do that manually. +/// +/// You might have expected a derive macro, `#[derive(tesid::TypeDiscriminant)]` with +/// `#[tesid(sparsity = 256)]` or similar, but I’ve stuck with a structural macro +/// for faster compilation and to keep zero dependencies. +/// +/// Example: +/// +/// ``` +/// tesid::define_type_discriminant!(sparsity=256, +/// #[non_exhaustive] +/// #[derive(Copy, Clone, Debug, PartialEq, Eq)] +/// pub enum Type { +/// A = 0, +/// B = 1, +/// C = 2, +/// } +/// ); +/// ``` +#[macro_export] +macro_rules! define_type_discriminant { + (sparsity = $sparsity:expr, + $(#[$attr:meta])* $vis:vis enum $name:ident { + $($variant_name:ident = $variant_value:expr),* $(,)? + }) => { + $(#[$attr])* $vis enum $name { + $($variant_name = $variant_value),* + } + + impl $crate::TypeDiscriminant for $name { + const SPARSITY: u64 = $sparsity; + + fn into_discriminant(self) -> u64 { + self as u64 + } + + fn from_discriminant(discriminant: u64) -> Option { + match discriminant { + $($variant_value => Some(Self::$variant_name),)* + _ => None, + } + } + } + }; +} + +/// A TESID coder with type discrimination baked in. 
+///
+/// All method examples will assume the following setup:
+///
+/// ```rust
+/// use tesid::{TesidCoder, TypedTesidCoder, define_type_discriminant, DecodeError, SplitDecode};
+///
+/// define_type_discriminant!(sparsity=256,
+///     #[derive(Copy, Clone, Debug, PartialEq, Eq)]
+///     #[non_exhaustive]
+///     pub enum Type {
+///         A = 0,
+///         B = 1,
+///         C = 2,
+///     }
+/// );
+///
+/// let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap();
+/// let coder = TypedTesidCoder::<Type>::new(coder);
+/// ```
+///
+/// See [`define_type_discriminant!`] for the usual way of defining a type for `D`.
+#[derive(Clone)]
+pub struct TypedTesidCoder<D> {
+    coder: TesidCoder,
+    marker: PhantomData<D>,
+}
+
+impl<D> fmt::Debug for TypedTesidCoder<D> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("TypedTesidCoder").finish_non_exhaustive()
+    }
+}
+
+impl<D: TypeDiscriminant> TypedTesidCoder<D> {
+    /// Initialise a typed TESID coder.
+    ///
+    /// This takes a [`TesidCoder`] (rather than a key) so that you can clone an existing coder,
+    /// if you don’t always use the one sparsity and type enum.
+    pub fn new(coder: TesidCoder) -> Self {
+        Self { coder, marker: PhantomData }
+    }
+
+    /// Encode an ID with a specific type.
+    ///
+    /// ```rust
+    /// # use tesid::{TesidCoder, TypedTesidCoder, define_type_discriminant, DecodeError, SplitDecode};
+    /// # define_type_discriminant!(sparsity=256,
+    /// #     #[derive(Copy, Clone, Debug, PartialEq, Eq)] #[non_exhaustive]
+    /// #     pub enum Type { A = 0, B = 1, C = 2, });
+    /// # let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap();
+    /// # let coder = TypedTesidCoder::<Type>::new(coder);
+    /// assert_eq!(&*coder.encode(Type::A, 0), "w2ej");
+    /// assert_eq!(&*coder.encode(Type::B, 0), "w6um");
+    /// assert_eq!(&*coder.encode(Type::A, 1), "dh2h");
+    /// ```
+    pub fn encode(&self, r#type: D, id: u64) -> InlineString<20> {
+        self.coder.encode_with_tag(id, D::SPARSITY, r#type.into_discriminant())
+    }
+
+    /// Decode an ID with a specific type.
+    ///
+    /// ```rust
+    /// # use tesid::{TesidCoder, TypedTesidCoder, define_type_discriminant, DecodeError, SplitDecode};
+    /// # define_type_discriminant!(sparsity=256,
+    /// #     #[derive(Copy, Clone, Debug, PartialEq, Eq)] #[non_exhaustive]
+    /// #     pub enum Type { A = 0, B = 1, C = 2, });
+    /// # let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap();
+    /// # let coder = TypedTesidCoder::<Type>::new(coder);
+    /// assert_eq!(coder.decode(Type::A, "w2ej"), Ok(0));
+    /// assert_eq!(coder.decode(Type::B, "w6um"), Ok(0));
+    /// assert_eq!(coder.decode(Type::A, "dh2h"), Ok(1));
+    /// assert_eq!(coder.decode(Type::A, "w6um"),
+    ///            Err(DecodeError::WrongDiscriminant { id: 0, discriminant: Type::B }));
+    /// ```
+    pub fn decode(&self, r#type: D, tesid: &str) -> Result<u64, DecodeError<D>> {
+        self.coder.decode_with_tag(tesid, D::SPARSITY, r#type.into_discriminant())
+            .map_err(Self::convert_decode_error)
+    }
+
+    /// Decode an ID and type and return both.
+    ///
+    /// ```rust
+    /// # use tesid::{TesidCoder, TypedTesidCoder, define_type_discriminant, DecodeError, SplitDecode};
+    /// # define_type_discriminant!(sparsity=256,
+    /// #     #[derive(Copy, Clone, Debug, PartialEq, Eq)] #[non_exhaustive]
+    /// #     pub enum Type { A = 0, B = 1, C = 2, });
+    /// # let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap();
+    /// # let coder = TypedTesidCoder::<Type>::new(coder);
+    /// assert_eq!(coder.split_decode("w2ej"), Ok(SplitDecode { id: 0, discriminant: Type::A }));
+    /// assert_eq!(coder.split_decode("w6um"), Ok(SplitDecode { id: 0, discriminant: Type::B }));
+    /// assert_eq!(coder.split_decode("dh2h"), Ok(SplitDecode { id: 1, discriminant: Type::A }));
+    /// assert_eq!(coder.split_decode("6mqv"),
+    ///            Err(DecodeError::MalformedDiscriminant { id: 0, discriminant: 3 }));
+    /// ```
+    pub fn split_decode(&self, tesid: &str) -> Result<SplitDecode<D>, DecodeError<D>> {
+        self.coder.split_decode(tesid, D::SPARSITY)
+            .and_then(|SplitDecode { id, discriminant }| Ok(SplitDecode {
+                id,
+                discriminant: D::from_discriminant(discriminant)
+                    .ok_or(DecodeError::MalformedDiscriminant { id, discriminant })?,
+            }))
+            .map_err(Self::convert_decode_error)
+    }
+
+    /// `DecodeError` → `DecodeError<D>`, just redoing `WrongDiscriminant`.
+    fn convert_decode_error(e: DecodeError) -> DecodeError<D> {
+        match e {
+            DecodeError::BadLength => DecodeError::BadLength,
+            DecodeError::BadCharacters => DecodeError::BadCharacters,
+            DecodeError::OverlyLongEncoding => DecodeError::OverlyLongEncoding,
+            DecodeError::OutOfRange => DecodeError::OutOfRange,
+            DecodeError::WrongDiscriminant { id, discriminant } => {
+                match D::from_discriminant(discriminant) {
+                    Some(discriminant) => DecodeError::WrongDiscriminant { id, discriminant },
+                    None => DecodeError::MalformedDiscriminant { id, discriminant },
+                }
+            },
+            DecodeError::MalformedDiscriminant { id, discriminant } =>
+                DecodeError::MalformedDiscriminant { id, discriminant },
+        }
+    }
+}
+
+// I don’t want to use the README as the crate docs,
+// but I can still run the examples in it!
+#[cfg_attr(doctest, doc = include_str!("../README.md"))]
+fn _readme_doctests() {}
+
+#[cfg(test)]
+#[allow(unused_parens)] // The constants, for consistency
+mod tests {
+    use super::*;
+
+    // 19 known values, 12 u64 and 7 u128.
+    const FIRST_4:  u64  = 0            ;
+    const LAST_4:   u64  = (1 << 20) - 1;
+    const FIRST_6:  u64  = (1 << 20)    ;
+    const LAST_6:   u64  = (1 << 30) - 1;
+    const FIRST_8:  u64  = (1 << 30)    ;
+    const LAST_8:   u64  = (1 << 40) - 1;
+    const FIRST_10: u64  = (1 << 40)    ;
+    const LAST_10:  u64  = (1 << 50) - 1;
+    const FIRST_12: u64  = (1 << 50)    ;
+    const LAST_12:  u64  = (1 << 60) - 1;
+    const FIRST_14: u64  = (1 << 60)    ;
+    const MID_14:   u64  = u64::MAX     ;
+    const LAST_14:  u128 = (1 << 70) - 1;
+    const FIRST_16: u128 = (1 << 70)    ;
+    const LAST_16:  u128 = (1 << 80) - 1;
+    const FIRST_18: u128 = (1 << 80)    ;
+    const LAST_18:  u128 = (1 << 90) - 1;
+    const FIRST_20: u128 = (1 << 90)    ;
+    const LAST_20:  u128 = (1 << 100) - 1;
+
+    #[test]
+    fn tests() {
+        macro_rules! case {
+            ($coder:ident, sparsity=$sparsity:expr, discriminant=$discriminant:expr, split=$split:ident, id=$number:expr, tesid=$string:expr) => {
+                assert_eq!(&*$coder.encode_with_tag($number, $sparsity, $discriminant), $string);
+                assert_eq!($coder.decode_with_tag($string, $sparsity, $discriminant), Ok($number));
+                $split!($coder.split_decode($string, $sparsity), Ok(SplitDecode { id: $number, discriminant: $discriminant }));
+                //println!("{}", $coder.encode_with_tag($number, $sparsity, $discriminant));
+            };
+            ($coder:ident, u64 $number:expr, $string:expr) => {
+                // Test it with both the long and normal methods.
+                case!($coder, $number as u128, $string);
+                assert_eq!(&*$coder.encode($number), $string);
+                assert_eq!($coder.decode($string), Ok($number));
+            };
+            ($coder:ident, $number:expr, $string:expr) => {
+                assert_eq!(&*$coder.encode_long($number).unwrap(), $string);
+                assert_eq!($coder.decode_long($string), Ok($number));
+                //println!("{}", $coder.encode_long($number).unwrap());
+            };
+        }
+
+        let coder = TesidCoder::new("00000000000000000000000000000000").unwrap();
+        case!(coder, u64 FIRST_4 , "4kcc");
+        case!(coder, u64 LAST_4 , "3rck");
+        case!(coder, u64 FIRST_6 , "ju2sgs");
+        case!(coder, u64 LAST_6 , "zangyh");
+        case!(coder, u64 FIRST_8 , "2aux4u3h");
+        case!(coder, u64 LAST_8 , "3cd7rc4h");
+        case!(coder, u64 FIRST_10, "m8669y33k6");
+        case!(coder, u64 LAST_10, "45e9rbrvvu");
+        case!(coder, u64 FIRST_12, "t47yf553iv8t");
+        case!(coder, u64 LAST_12, "cwd8t75epzje");
+        case!(coder, u64 FIRST_14, "86hk4d8hj4yvcy");
+        case!(coder, u64 MID_14, "sirnf2k2d2m3bm");
+        case!(coder, LAST_14, "m77g4ezr3e8qay");
+        case!(coder, FIRST_16, "43xf2jj6r6qm8bw4");
+        case!(coder, LAST_16, "6h3wb7wytjr5tbrd");
+        case!(coder, FIRST_18, "4vumjq33d8iiwaharq");
+        case!(coder, LAST_18, "qd7s3csnc5yfrrud5t");
+        case!(coder, FIRST_20, "jd3vsipfn69ia72chuvx");
+        case!(coder, LAST_20, "628fg5kyid3z2vf2j4tf");
+
+        let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap();
+        case!(coder, u64 FIRST_4 , "w2ej");
+        case!(coder, u64 LAST_4 , "atcw");
+        case!(coder, u64 FIRST_6 , "8qwm6y");
+        case!(coder, u64 LAST_6 , "3eipc7");
+        case!(coder, u64 FIRST_8 , "n3md95r4");
+        case!(coder, u64 LAST_8 , "nnz4z5qb");
+        case!(coder, u64 FIRST_10, "st9fvria97");
+        case!(coder, u64 LAST_10, "qt42fug7hq");
+        case!(coder, u64 FIRST_12, "dykqxtu2ieqi");
+        case!(coder, u64 LAST_12, "h7rhnw6tfhun");
+        case!(coder, u64 FIRST_14, "xb5c8isevin9i3");
+        case!(coder, u64 MID_14, "t62mijffzuvu4e");
+        case!(coder, LAST_14, "n6n8jq6ike9dnj");
+        case!(coder, FIRST_16, "zk9d3setywjf7uwu");
+        case!(coder, LAST_16, "bqqei5vmzkqjfru3");
+        case!(coder, FIRST_18, "z83vvq5u84sit9g7pd");
+        case!(coder, LAST_18, "cpawgn8snjvverxvmp");
+        case!(coder, FIRST_20, "397btwmkh5y7sjz2xu82");
+        case!(coder, LAST_20, "ia2bvpjaiju7g5uaxn5t");
+
+        assert_eq!(coder.encode_long(1 << 100), Err(ValueTooLarge));
+        assert_eq!(coder.encode_long(u128::MAX), Err(ValueTooLarge));
+
+        // Test misencodings: 0 is w2ej, but if 0 were encrypted with n=15 instead of n=10,
+        // it’d be m2eig5—so that value isn’t allowed.
+        // Specific value found with:
+        // panic!("{}", InlineString { buf: base32::encode(fpeck::encrypt(&coder.expanded_key, 15, 0)), start: 20 - 6 });
+        assert_eq!(coder.decode("m2eig5"), Err(DecodeError::OverlyLongEncoding));
+
+        // … but slightly changed values are probably valid (since only one in 2¹⁰ is invalid).
+ assert_eq!(coder.decode("m2eig6"), Ok(473063752)); + + // Also a few more at the boundaries for confidence: + // LAST_4 (2²⁰−1) but encoded with n=15 instead of n=10 + assert_eq!(coder.decode("vf5fem"), Err(DecodeError::OverlyLongEncoding)); + // LAST_6 (2³⁰−1) but encoded with n=20 instead of n=15 + assert_eq!(coder.decode("ixs6h9ma"), Err(DecodeError::OverlyLongEncoding)); + // LAST_6 (2³⁰−1) but encoded with n=50 instead of n=10 + assert_eq!(coder.decode("uhkprgrirp45pe54twsa"), Err(DecodeError::OverlyLongEncoding)); + + // Bad string lengths + assert_eq!(coder.decode(""), Err(DecodeError::BadLength)); + assert_eq!(coder.decode("2"), Err(DecodeError::BadLength)); + assert_eq!(coder.decode("22"), Err(DecodeError::BadLength)); + assert_eq!(coder.decode("222"), Err(DecodeError::BadLength)); + assert_eq!(coder.decode("22222"), Err(DecodeError::BadLength)); + assert_eq!(coder.decode_long("2222222222222222222"), Err(DecodeError::BadLength)); + assert_eq!(coder.decode_long("222222222222222222222"), Err(DecodeError::BadLength)); + + // … but just so it’s clear, ones are fine, it was just the lengths that were wrong. + case!(coder, u64 173734, "2222"); + case!(coder, u64 592178178, "222222"); + case!(coder, 111515659577240532774228475483, "22222222222222222222"); + + // Now time for some tagging. 
+ case!(coder, sparsity=1, discriminant=0, split=assert_eq, id=0, tesid="w2ej"); + case!(coder, sparsity=1, discriminant=1, split=assert_ne, id=0, tesid="w6um"); + case!(coder, sparsity=1, discriminant=2, split=assert_ne, id=0, tesid="x45g"); + + case!(coder, sparsity=100, discriminant=0, split=assert_eq, id=0, tesid="w2ej"); + case!(coder, sparsity=100, discriminant=1, split=assert_eq, id=0, tesid="w6um"); + case!(coder, sparsity=100, discriminant=2, split=assert_eq, id=0, tesid="x45g"); + case!(coder, sparsity=100, discriminant=0, split=assert_eq, id=1, tesid="ypbn"); + case!(coder, sparsity=100, discriminant=1, split=assert_eq, id=1, tesid="k9pw"); + case!(coder, sparsity=100, discriminant=2, split=assert_eq, id=1, tesid="b7nc"); + case!(coder, sparsity=100, discriminant=0, split=assert_eq, id=2, tesid="r9yc"); + case!(coder, sparsity=100, discriminant=1, split=assert_eq, id=2, tesid="arf2"); + case!(coder, sparsity=100, discriminant=2, split=assert_eq, id=2, tesid="z6wh"); + case!(coder, 000, "w2ej"); + case!(coder, 001, "w6um"); + case!(coder, 002, "x45g"); + case!(coder, 100, "ypbn"); + case!(coder, 101, "k9pw"); + case!(coder, 102, "b7nc"); + case!(coder, 200, "r9yc"); + case!(coder, 201, "arf2"); + case!(coder, 202, "z6wh"); + // The highest sparsity that’s always valid: 2³⁶ − 1 + case!(coder, sparsity=(1<<36) - 1, discriminant=u64::MAX, split=assert_ne, id=u64::MAX, tesid="fjwz5jk3p4gz9aqes22e"); + case!(coder, 1267650600228229401427983728640, "fjwz5jk3p4gz9aqes22e"); + } + + #[cfg(bench)] + #[bench] + fn bench_init(b: &mut test::Bencher) { + b.iter(|| { + TesidCoder::new(test::black_box("000102030405060708090a0b0c0d0e0f")).unwrap(); + }); + } + + #[cfg(bench)] + #[bench] + fn bench_encode(b: &mut test::Bencher) { + let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + b.iter(|| { + // This benchmark is deliberately ordered to require a different expanded key every + // time, in order to mildly thwart caching and provide a *somewhat* 
more realistic + // result. When it used the same expanded key twice in a row (FIRST_4, LAST_4, FIRST_6, + // LAST_6, &c.) it took around 1000ns. Once I made it use a different expanded key each + // time, that increased to around 1770ns, which is under 100ns per TESID, hopefully not + // *too* unrealistic a general figure, but it is of course not completely realistic. + for i in test::black_box([ + FIRST_4 as u128, + FIRST_6 as u128, + FIRST_8 as u128, + FIRST_10 as u128, + FIRST_12 as u128, + FIRST_14 as u128, + FIRST_16, + FIRST_18, + FIRST_20, + MID_14 as u128, + LAST_4 as u128, + LAST_6 as u128, + LAST_8 as u128, + LAST_10 as u128, + LAST_12 as u128, + LAST_14, + LAST_16, + LAST_18, + LAST_20, + ]) { + coder.encode_long(i).unwrap(); + } + }); + } + + // Indicative figure: 82ms, meaning ~80ns per encode. + #[cfg(bench)] + #[bench] + fn bench_encode_all_four_character_tesids(b: &mut test::Bencher) { + let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + b.iter(|| for i in 0..0b100000000000000000000 { + test::black_box(coder.encode(test::black_box(i))); + }) + } + + // Indicative figure: 210ms, meaning ~200ns per encode-and-decode, suggesting decode is slower + // than encode! + #[cfg(bench)] + #[bench] + fn bench_encode_and_decode_all_four_character_tesids(b: &mut test::Bencher) { + let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + b.iter(|| for i in 0..0b100000000000000000000 { + coder.decode(&coder.encode(test::black_box(i))).unwrap(); + }) + } + + #[cfg(feature = "std")] + #[test] + #[cfg_attr(debug_assertions, ignore)] // Takes a few seconds in debug mode (<0.4s in release). + // Very unscientific approach: a single run of this takes under 0.4s, which is under 400ns per + // iteration, which includes an encode, a decode, and a HashSet insert. 
+ fn show_uniqueness_of_four_character_tesids() { + let coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + let mut seen = std::collections::HashSet::new(); + for i in 0..0b100000000000000000000 { + let string = coder.encode(i); + assert_eq!(coder.decode(&string).unwrap(), i); + assert!(string.len() == 4); + assert!(seen.insert(string)); + } + } + + #[test] + fn zero_on_drop() { + let mut coder = TesidCoder::new("000102030405060708090a0b0c0d0e0f").unwrap(); + unsafe { + ptr::drop_in_place(&mut coder); + } + assert_eq!(coder.expanded_key, [0; 30]); + } +} diff --git a/rust/test b/rust/test new file mode 100755 index 00000000..73d00fd --- /dev/null +++ b/rust/test @@ -0,0 +1,14 @@ +#!/bin/sh +set -e +export RUSTFLAGS="-D warnings" +export RUSTDOCFLAGS="-D warnings" +for release in "" "--release"; do + for subcommand in clippy test doc; do + cargo $subcommand $release + cargo $subcommand $release --all-features + done +done + +RUSTFLAGS="--cfg bench" cargo +nightly bench +RUSTDOCFLAGS="-D warnings --cfg docsrs" cargo +nightly doc --all-features --no-deps +cargo msrv verify -- 2.42.0