mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
docs(tfhe): add FHE Regex Pattern Matching Engine
this includes a tutorial and an example implementation for the regex bounty
This commit is contained in:
@@ -30,6 +30,11 @@ itertools = "0.10.5"
|
||||
num_cpus = "1.15"
|
||||
# For erf and normality test
|
||||
libm = "0.2.6"
|
||||
test-case = "*"
|
||||
combine = "*"
|
||||
anyhow = "*"
|
||||
env_logger = "*"
|
||||
log = "*"
|
||||
|
||||
[build-dependencies]
|
||||
cbindgen = { version = "0.24.3", optional = true }
|
||||
@@ -192,5 +197,9 @@ required-features = ["shortint", "internal-keycache"]
|
||||
name = "micro_bench_and"
|
||||
required-features = ["boolean"]
|
||||
|
||||
[[example]]
|
||||
name = "regex_engine"
|
||||
required-features = ["integer"]
|
||||
|
||||
[lib]
|
||||
crate-type = ["lib", "staticlib", "cdylib"]
|
||||
|
||||
22
tfhe/docs/regex/implementation-details.md
Normal file
22
tfhe/docs/regex/implementation-details.md
Normal file
@@ -0,0 +1,22 @@
|
||||
Internally the regex engine works on a vector of encrypted content characters
|
||||
(ie each content's character is encrypted individually). As a consequence this
|
||||
does mean that at least some information about the content is leaked to the
|
||||
party that is applying the regex pattern: the length of the content.
|
||||
|
||||
It parses the pattern, then generates lazily (in the sense of not yet executing
|
||||
any homomorphic operations) the list of potential homomorphic circuits that
|
||||
must each be ran exhaustively. The list is lazily generated, so as to exclude
|
||||
any pattern that is provably going to result in a false result from being
|
||||
homomorphically executed. For example, consider an application of `/^a+b$/` on
|
||||
content `acb`, then any pattern that doesn't start from the first content
|
||||
character and any pattern that does not end at the final content character can
|
||||
immediately be discarded. In this example it'd mean that we would only end up
|
||||
executing the homomorphic circuit generated to test for `aab`. Finally, each
|
||||
executed variant is then joined together with homomorphic `bitor` operations
|
||||
to reach a single result.
|
||||
|
||||
Each homomorphic operation is expensive, and so to limit any double work there
|
||||
is a cache maintained. For example, `/^a?ab/` will generate multiple circuit
|
||||
variants where `a` is homomorphically compared to a same content's character.
|
||||
The cache prevents any such recomputations from being actually recomputed; we
|
||||
already know the answer.
|
||||
51
tfhe/docs/regex/patterns.md
Normal file
51
tfhe/docs/regex/patterns.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# Supported regex patterns
|
||||
|
||||
This document specifies the supported set of regex patterns in the regex engine.
|
||||
|
||||
## Components
|
||||
|
||||
A regex is described by a sequence of components surrounded by `/`, the
|
||||
following components are supported:
|
||||
|
||||
Name | Notation | Examples
|
||||
--- | --- | ---
|
||||
Character | Simply the character itself | `/a/`, `/b/`, `/Z/`, `/5/`
|
||||
Character range | `[<character>-<character>]` | `/[a-d]/`, `/[C-H]/`
|
||||
Any character | `.` | `/a.c/`
|
||||
Escaped symbol | `\<symbol>` | `/\^/`, `/\$/`
|
||||
Parenthesis | `(<regex>)` | `/(abc)*/`, `/d(ab)?/`
|
||||
Optional | `<regex>?` | `/a?/`, `/(az)?/`
|
||||
Zero or more | `<regex>*` | `/a*/`, `/ab*c/`
|
||||
One or more | `<regex>+` | `/a+/`, `/ab+c/`
|
||||
Exact repeat | `<regex{<number>}>` | `/ab{2}c/`
|
||||
At least repeat | `<regex{<number>,}>` | `/ab{2,}c/`
|
||||
At most repeat | `<regex{,<number>}>` | `/ab{,2}c/`
|
||||
Repeat between | `<regex{<number>,<number>}>` | `/ab{2,4}c/`
|
||||
Either | `<regex>\|<regex>` | `/a\|b/`, `/ab\|cd/`
|
||||
Start matching | `/^<regex>` | `/^abc/`
|
||||
End matching | `<regex>$/` | `/abc$/`
|
||||
|
||||
## Modifiers
|
||||
|
||||
Modifiers are mode selectors that affect the entire regex's behavior. At the
|
||||
moment there is 1 modifier supported:
|
||||
|
||||
- Case insensitive matching, by appending an `i` after the regex pattern. For example: `/abc/i`
|
||||
|
||||
## General examples
|
||||
|
||||
These components and modifiers can be combined to form any desired regex
|
||||
pattern. To give some idea of what's possible, here is a non-exhaustive list of
|
||||
supported regex patterns:
|
||||
|
||||
Pattern | Description
|
||||
--- | ---
|
||||
`/^abc$/` | Matches with content that equals exactly `abc` (case sensitive)
|
||||
`/^abc$/i` | Matches with content that equals `abc` (case insensitive)
|
||||
`/abc/` | Matches with content that contains somewhere `abc`
|
||||
`/ab?c/` | Matches with content that contains somewhere `abc` or somewhere `ab`
|
||||
`/^ab*c$/` | For example, matches with: `ac`, `abc`, `abbbbc`
|
||||
`/^[a-c]b\|cd$/` | Matches with: `ab`, `bb`, `cb`, `cd`
|
||||
`/^[a-c]b\|cd$/i` | Matches with: `ab`, `Ab`, `aB`, ..., `cD`, `CD`
|
||||
`/^d(abc)+d$/` | For example, matches with: `dabcd`, `dabcabcd`, `dabcabcabcd`
|
||||
`/^a.*d$/` | Matches with any content that starts with `a` and ends with `d`
|
||||
466
tfhe/docs/regex/tutorial.md
Normal file
466
tfhe/docs/regex/tutorial.md
Normal file
@@ -0,0 +1,466 @@
|
||||
# FHE Regex Pattern Matching Tutorial
|
||||
|
||||
This tutorial explains how to build a Regex pattern matching engine where the
|
||||
content that is matched against is ciphertext.
|
||||
|
||||
A regex Pattern Matching Engine (PME) is an essential tool for programmers. It
|
||||
allows one to perform complex searches on content. The less powerful simple
|
||||
search on string can only find matches of the exact given sequence of
|
||||
characters (eg, your Browser's default search function does this). Regex PME
|
||||
are more powerful, it allows to search on certain structures of text (where a
|
||||
structure may take form in multiple possible sequences of characters). The
|
||||
structure to be searched for is defined with the regex, a very concise
|
||||
language. Here are some example regexes to give some idea of what is possible:
|
||||
|
||||
Regex | Semantics
|
||||
--- | ---
|
||||
/abc/ | Searches for the sequence `abc` (equivalent to the simple text search)
|
||||
/^abc/ | Searches for the sequence `abc` at the beginning of the content
|
||||
/a?bc/ | Searches for sequences `abc`, `bc`
|
||||
/ab\|c+d/ | Searches for sequences of `ab`, `c` repeated 1 or more times followed by `d`
|
||||
|
||||
Regexes are powerful enough to be able to express structures like email address
|
||||
formats. Capability like this is what makes regexes useful for many programming
|
||||
solutions.
|
||||
|
||||
There are two main components identifiable in a PME:
|
||||
1. the pattern that is to be matched has to be parsed, this translates from a
|
||||
textual representation into a recursively structured object (an Abstract
|
||||
Syntax Tree, or AST)
|
||||
2. this AST has to then be applied to the text that is to be matched against,
|
||||
resulting in a yes or no to whether the pattern matched (and in the case of
|
||||
our FHE implementation, this result is an encrypted yes or an encrypted no)
|
||||
|
||||
Parsing is a well understood problem. There are a couple of different
|
||||
approaches possible here. Regardless of the approach chosen, it starts with
|
||||
figuring out what the language is that we want to support. That is, what are
|
||||
the kinds of sentences that we want our regex language to include? A few
|
||||
example sentences we definitely want to support are for example: `/a/`,
|
||||
`/a?bc/`, `/^ab$/`, `/ab|cd/`, however example sentences don't suffice here as
|
||||
a specification because they can never be exhaustive (they're endless). We need
|
||||
something to specify _exactly_ the full set of sentences our language supports.
|
||||
There exists a language that can help us describe exactly what our own
|
||||
language's structure is: Grammars.
|
||||
|
||||
## The Grammar and Datastructure
|
||||
|
||||
It is useful to start with defining the Grammar before starting to write
|
||||
the code for the parser. Because the code structure follows directly from the
|
||||
Grammar. A Grammar consists of a (generally small) set of rules. For example,
|
||||
a very basic Grammar could look like this:
|
||||
```
|
||||
Start := 'a'
|
||||
```
|
||||
this describes a language that only contains the sentence "a". Not a very interesting language.
|
||||
|
||||
We can make it more interesting though by introducing choice into the grammar
|
||||
with \| (called a 'pipe') operators. If we want the above grammar to accept
|
||||
either "a" or "b":
|
||||
```
|
||||
Start := 'a' | 'b'
|
||||
```
|
||||
|
||||
So far only Grammars with a single rule have been shown. However, a Grammar can
|
||||
consist of multiple rules. And in fact, most languages require to be defined
|
||||
over multiple rules. Lets consider a more meaningful language, one that accepts
|
||||
sentences consisting of one or more digits, we could describe such a language
|
||||
with the following Grammar:
|
||||
```
|
||||
Start := Digit+
|
||||
|
||||
Digit := '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
|
||||
```
|
||||
|
||||
The `+` above after `Digit` is another Grammar operator. With it we specify that
|
||||
Digit must be matched 1 or more times. Here are all the Grammar operators that
|
||||
are relevant for this tutorial:
|
||||
|
||||
Operator | Example | Semantics
|
||||
--- | --- | ---
|
||||
`\|` | a \| b | we first try matching on a, on no match we try to match on b
|
||||
`+` | a+ | match a 1 or more times
|
||||
`*` | a* | match a any amount of times (including zero times)
|
||||
`?` | a? | optionally match a (match 0 or 1 time)
|
||||
`.` | . | match any character
|
||||
`..` | a .. b | match on a range of alphabetically ordered characters from a to (and including) b
|
||||
` ` | a b | sequencing; match on a and then on b
|
||||
|
||||
In the case of the example PME the grammar is as follows (notice the unquoted ? and quoted ? etc., the unquoted are Grammar operators and the quoted are characters we are matching in the parsing)
|
||||
```
|
||||
Start := '/' '^'? Regex '$'? '/' Modifier?
|
||||
|
||||
Regex := Term '|' Term
|
||||
| Term
|
||||
|
||||
Term := Factor*
|
||||
|
||||
Factor := Atom '?'
|
||||
| Repeated
|
||||
| Atom
|
||||
|
||||
Repeated := Atom '*'
|
||||
| Atom '+'
|
||||
| Atom '{' Digit* ','? '}'
|
||||
| Atom '{' Digit+ ',' Digit* '}'
|
||||
|
||||
Atom := '.'
|
||||
| '\' .
|
||||
| Character
|
||||
| '[' Range ']'
|
||||
| '(' Regex ')'
|
||||
|
||||
Range := '^' Range
|
||||
| AlphaNum '-' AlphaNum
|
||||
| AlphaNum+
|
||||
|
||||
Digit := '0' .. '9'
|
||||
|
||||
Character := AlphaNum
|
||||
| '&' | ';' | ':' | ',' | '`' | '~' | '-' | '_' | '!' | '@' | '#' | '%' | '\'' | '\"'
|
||||
|
||||
AlphaNum := 'a' .. 'z'
|
||||
| 'A' .. 'Z'
|
||||
| '0' .. '9'
|
||||
|
||||
Modifier := 'i'
|
||||
```
|
||||
Below will refer occasionally to specific parts in the Grammar above by \<rule name\>.\<variant index\> (where the first rule variant has index 1).
|
||||
|
||||
With the Grammar defined, we can start defining a type to parse into. In Rust we
|
||||
have the `enum` kind of type that is perfect for this, as it allows to define
|
||||
multiple variants that may recurse. I prefer to start by defining variants that
|
||||
do not recurse (ie that don't contain nested regex expressions):
|
||||
```rust
|
||||
enum RegExpr {
|
||||
Char { c: char }, // matching against a single character (Atom.2 and Atom.3)
|
||||
AnyChar, // matching _any_ character (Atom.1)
|
||||
SOF, // matching only at the beginning of the content ('^' in Start.1)
|
||||
EOF, // matching only at the end of the content (the '$' in Start.1)
|
||||
Range { cs: Vec<char> }, // matching on a list of characters (Range.3, eg '[acd]')
|
||||
Between { from: char, to: char }, // matching between 2 characters based on ascii ordering (Range.2, eg '[a-g]')
|
||||
}
|
||||
```
|
||||
|
||||
With this we would already be able to translate the following basic regexes:
|
||||
|
||||
Pattern | RegExpr value
|
||||
--- | ---
|
||||
`/a/` | `RegExpr::Char { c: 'a' }`
|
||||
`/\\^/` | `RegExpr::Char { c: '^' }`
|
||||
`/./` | `RegExpr::AnyChar`
|
||||
`/^/` | `RegExpr::SOF`
|
||||
`/$/` | `RegExpr::EOF`
|
||||
`/[acd]/` | `RegExpr::Range { vec!['a', 'c', 'd'] }`
|
||||
`/[a-g]/` | `RegExpr::Between { from: 'a', to: 'g' }`
|
||||
|
||||
Notice we're not yet able to sequence multiple components together. Lets define
|
||||
the first variant that captures recursive RegExpr for this:
|
||||
```rust
|
||||
enum RegExpr {
|
||||
...
|
||||
Seq { re_xs: Vec<RegExpr> }, // matching sequences of RegExpr components (Term.1)
|
||||
}
|
||||
```
|
||||
With this Seq (short for sequence) variant we allow translating patterns that
|
||||
contain multiple components:
|
||||
|
||||
Pattern | RegExpr value
|
||||
--- | ---
|
||||
`/ab/` | `RegExpr::Seq { re_xs: vec![RegExpr::Char { c: 'a' }, RegExpr::Char { c: 'b' }] }`
|
||||
`/^a.$/` | `RegExpr::Seq { re_xs: vec![RegExpr::SOF, RexExpr::Char { 'a' }, RegExpr::AnyChar, RegExpr::EOF] }`
|
||||
`/a[f-l]/` | `RegExpr::Seq { re_xs: vec![RegExpr::Char { c: 'a' }, RegExpr::Between { from: 'f', to: 'l' }] }`
|
||||
|
||||
Lets finish the RegExpr datastructure by adding variants for optional matching,
|
||||
the not logic in a range, and the either left or right matching:
|
||||
```rust
|
||||
enum RegExpr {
|
||||
...
|
||||
Optional { opt_re: Box<RegExpr> }, // matching optionally (Factor.1)
|
||||
Not { not_re: Box<RegExpr> }, // matching inversely on a range (Range.1)
|
||||
Either { l_re: Box<RegExpr>, r_re: Box<RegExpr> }, // matching the left or right regex (Regex.1)
|
||||
}
|
||||
```
|
||||
|
||||
Some features may make most sense to be implemented through post processing of
|
||||
the parsed datastructure. For example, the case insensitivity feature (the `i`
|
||||
Modifier) is implemented in the example implementation by taking the parsed
|
||||
RegExpr, and mutating every character mentioned inside to cover both the lower
|
||||
case as well as the upper case variant (see function `case_insensitive` in
|
||||
`parser.rs` for the example implementation of this).
|
||||
|
||||
The modifier `i` in our Grammar (for enabling case insensitivity) seemed easiest
|
||||
to implement by applying a post processing step to the parser.
|
||||
|
||||
We are now able to translate any complex regex into a RegExpr value. For example:
|
||||
|
||||
Pattern | RegExpr value
|
||||
--- | ---
|
||||
`/a?/` | `RegExpr::Optional { opt_re: Box::new(RegExpr::Char { c: 'a' }) }`
|
||||
`/[a-d]?/` | `RegExpr::Optional { opt_re: Box::new(RegExpr::Between { from: 'a', to: 'd' }) }`
|
||||
`/[^ab]/` | `RegExpr::Not { not_re: Box::new(RegExpr::Range { cs: vec!['a', 'b'] }) }`
|
||||
`/av\|d?/` | `RegExpr::Either { l_re: Box::new(RegExpr::Seq { re_xs: vec![RegExpr::Char { c: 'a' }, RegExpr::Char { c: 'v' }] }), r_re: Box::new(RegExpr::Optional { opt_re: Box::new(RegExpr::Char { c: 'd' }) }) }`
|
||||
`/(av\|d)?/` | `RegExpr::Optional { opt_re: Box::new(RegExpr::Either { l_re: Box::new(RegExpr::Seq { re_xs: vec![RegExpr::Char { c: 'a' }, RegExpr::Char { c: 'v' }] }), r_re: Box::new(RegExpr::Char { c: 'd' }) }) }`
|
||||
|
||||
With both the Grammar and the datastructure to parse into defined, we can now
|
||||
start implementing the actual parsing logic. There are multiple ways this can
|
||||
be done. For example there exist tools that can automatically generate parser
|
||||
code by giving it the Grammar definition (these are called parser generators).
|
||||
However, I prefer to write parsers myself with a parser combinator library.
|
||||
Because, in my opinion the behavior in runtime is better understandable of
|
||||
parsers constructed with a parser combinator library than of parsers that were
|
||||
generated with a parser generator tool.
|
||||
|
||||
In Rust there exist a number of popular parser combinator libraries, I went
|
||||
with `combine` but any other would work just as well. Choose whichever appeals
|
||||
the most to you (including any parser generator tool). The implementation of
|
||||
our regex parser will differ significantly depending on the approach you choose
|
||||
and as such I think it is better to omit this part from the tutorial. You may
|
||||
look at the parser code in the example implementation to get an idea on how
|
||||
this could be done. In general though the Grammar and the datastructure are
|
||||
the important components, the parser code follows directly from these.
|
||||
|
||||
## Matching the RegExpr to Encrypted Content
|
||||
|
||||
The next challenge is to build the execution engine, where we take a RegExpr
|
||||
value and recurse into it to apply the necessary actions on the encrypted
|
||||
content. We first have to define how we actually encode our content into an
|
||||
encrypted state. Once that is defined we can start working on how we will
|
||||
execute our RegExpr onto the encrypted content.
|
||||
|
||||
### Encoding and Encrypting the Content
|
||||
|
||||
It is not possible to encrypt the entire content into a single encrypted value,
|
||||
we can only encrypt numbers and do operations on those encrypted numbers with
|
||||
FHE. Therefore we have to find a scheme where we encode the content into a
|
||||
sequence of numbers that then are encrypted individually, to form a sequence of
|
||||
encrypted numbers.
|
||||
|
||||
I saw two strategies (though there may be additional, potentially better,
|
||||
ways):
|
||||
1. map each character of the content into the u8 ascii value, and then encrypt
|
||||
each bit of these u8 values individually.
|
||||
2. instead of encrypting each bit individually, encrypt each u8 ascii value in
|
||||
its entirety.
|
||||
|
||||
Even though strategy 1 would require more highlevel TFHE-rs operations to check
|
||||
for even a simple character match (we have to check each bit individually for
|
||||
equality, as opposed to checking the entire byte in 1 highlevel TFHE-rs
|
||||
operation), some experimentation did show that these options both performed
|
||||
relatively equally well on a regex like `/a/`. I suppose this is because
|
||||
bitwise FHE operations are relatively cheap compared to u8 FHE operations.
|
||||
However, option 1 falls apart as soon as you introduce the '[a-z]' regex logic.
|
||||
Because with option 2, it's possible to complete this match with just 3 TFHE-rs
|
||||
operations:
|
||||
```rust
|
||||
// note: this is pseudocode
|
||||
c = <the encrypted character under inspection>;
|
||||
sk = <the server key, aka the public key>
|
||||
|
||||
ge_from = sk.ge(c, 'a');
|
||||
le_to = sk.le(c, 'z');
|
||||
result = sk.bitand(ge_from, le_to);
|
||||
```
|
||||
`ge`, `le`, and `bitand` are the 3 cryptographic operations here.
|
||||
|
||||
If on the other hand we had encrypted the content with the first strategy,
|
||||
there would be no way to test for `greater/equal than from` and `less/equal
|
||||
than to`. We'd have to check for potential equality of each character between
|
||||
`from` and `to`, and then join the results together with a sequence of
|
||||
`sk.bitor`; way more cryptographic operations than in strategy 2.
|
||||
|
||||
Because FHE operations are computationally expensive, and strategy 1 requires
|
||||
significantly more FHE operations for matching on `[a-z]` regex logic, we
|
||||
should opt for strategy 2.
|
||||
|
||||
### Matching with the AST Versus Matching with a derived DFA
|
||||
|
||||
There are a lot of regex pattern matching engines. It's been built many times
|
||||
and it's been researched thoroughly. There are different strategies possible
|
||||
here. A straight forward strategy is to directly recurse into our RegExpr
|
||||
value and apply the necessary matching operations onto the content. In a way
|
||||
this is nice, because it allows us to link the RegExpr structure directly to
|
||||
the matching semantics. Resulting in code that is easier to
|
||||
understand/maintain/etc.
|
||||
|
||||
Alternatively, there exists an algorithm that transforms the AST (ie the
|
||||
RegExpr in our case) into a Deterministic Finite Automata (DFA). Normally this
|
||||
is a favorable approach in terms of efficiency, because the derived DFA can be
|
||||
walked over without needing to backtrack (whereas the former strategy cannot
|
||||
prevent backtracking). This means that the content can be walked over from
|
||||
character to character, and depending on what the character exactly is at this
|
||||
cursor, the DFA is conjunctively traveled in a definite direction which
|
||||
ultimately leads us to the `yes, there is a match` or the `no, there is no
|
||||
match`. There is a small upfront cost of having to translate the AST into the
|
||||
DFA, but the lack of backtracking during the matching generally makes up for
|
||||
this (especially if the content that is matched against is significantly big).
|
||||
|
||||
In our case though we are matching on encrypted content. We have no way to know
|
||||
what the character at our cursor is, and therefore no way to find this definite
|
||||
direction to go forward to in the DFA. Therefore, I don't think that
|
||||
translating the AST into the DFA helps us the way it does in normal regex
|
||||
pattern matching engines. And for this reason I opted for the former strategy,
|
||||
because it allows for matching logic that is easier to understand.
|
||||
|
||||
### Matching
|
||||
|
||||
In the previous section we decided we'll match by traversing into the RegExpr
|
||||
value. This section will explain exactly how to do that. Similarly to defining
|
||||
the Grammar, I find it is best to start with working out the non recursive
|
||||
RegExpr variants.
|
||||
|
||||
We'll start by defining the function that will recursively traverse into the RegExpr value:
|
||||
```rust
|
||||
|
||||
type StringCiphertext = Vec<RadixCiphertextBig>;
|
||||
type ResultCiphertext = RadixCiphertextBig;
|
||||
|
||||
fn match(
|
||||
sk: &ServerKey,
|
||||
content: &StringCiphertext,
|
||||
re: &RegExpr,
|
||||
c_pos: usize,
|
||||
) -> Vec<(ResultCiphertext, usize)> {
|
||||
let content_char = &content[c_pos];
|
||||
match re {
|
||||
...
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`sk` is the server key (aka public key),`content` is what we'll be matching
|
||||
against, `re` is the RegExpr value we built when parsing the regex, and `c_pos`
|
||||
is the cursor position (the index in content we are currently matching
|
||||
against).
|
||||
|
||||
The result is a vector of tuples, with first value of the tuple the so far
|
||||
computed ciphertext result, and second value the content position after the
|
||||
regex components were applied. It's a vector, because certain RegExpr variants
|
||||
require to consider a list of possible execution paths. For example, the
|
||||
RegExpr::Optional might succeed by applying _or_ by *not* applying the optional
|
||||
regex (and notice that in the former case `c_pos` moves forward whereas in the
|
||||
latter case it stays put).
|
||||
|
||||
On first call to `match` the entire regex pattern is matched starting with
|
||||
`c_pos=0`, then `match` is called again for the entire regex pattern with
|
||||
`c_pos=1`, etc. until `c_pos` exceeds the length of the content. Each of these
|
||||
alternative matches results are then joined together with `sk.bitor` operations
|
||||
(this works out correctly because if 1 of them results in true, then this means
|
||||
our matching algorithm in general should return true).
|
||||
|
||||
The `...` within the match statement above is what we will be working out for
|
||||
some of the RegExpr variants now. Starting with `RegExpr::Char`:
|
||||
```rust
|
||||
case RegExpr::Char { c } => {
|
||||
vec![(sk.eq(content_char, c), c_pos + 1)]
|
||||
},
|
||||
```
|
||||
|
||||
Lets consider an example of above's variant, if we apply `/a/` to content
|
||||
`bac`, we'd have the following list of `match` calls' `re` and `c_pos` values
|
||||
(for simplicity `re` is denoted in regex pattern instead of in RegExpr value):
|
||||
|
||||
re | c\_pos | Ciphertext operation
|
||||
--- | --- | ---
|
||||
/a/ | 0 | sk.eq(content[0], a)
|
||||
/a/ | 1 | sk.eq(content[1], a)
|
||||
/a/ | 2 | sk.eq(content[2], a)
|
||||
|
||||
And we would arrive at the following sequence of Ciphertext operations:
|
||||
```
|
||||
sk.bitor(sk.eq(content[0], a), sk.bitor(sk.eq(content[1], a), sk.eq(content[2], a)))
|
||||
```
|
||||
|
||||
AnyChar is a no operation:
|
||||
```rust
|
||||
case RegExpr::AnyChar => {
|
||||
// note: ct_true is just some constant representing True that is trivially encoded into ciphertext
|
||||
return vec![(ct_true, c_pos + 1)];
|
||||
}
|
||||
```
|
||||
|
||||
Sequence iterates over its `re_xs`, increasing the content position
|
||||
accordingly, and joins the results with `bitand` operations:
|
||||
```rust
|
||||
case RegExpr::Seq { re_xs } => {
|
||||
re_xs.iter().fold(|prev_results, re_x| {
|
||||
prev_results.iter().flat_map(|(prev_res, prev_c_pos)| {
|
||||
(x_res, new_c_pos) = match(sk, content, re_x, prev_c_pos);
|
||||
(sk.bitand(prev_res, x_res), new_c_pos)
|
||||
})
|
||||
}, (ct_true, c_pos))
|
||||
},
|
||||
```
|
||||
|
||||
Other variants are similar, they recurse and manipulate `re` and `c_pos`
|
||||
accordingly. Hopefully the general idea is already clear.
|
||||
|
||||
Ultimately the entire pattern matching logic unfolds into a sequence of just
|
||||
the following set of FHE operations:
|
||||
1. eq (tests for an exact character match)
|
||||
2. ge (tests for greater than or equal to a character)
|
||||
3. le (tests for less than or equal to a character)
|
||||
4. bitand (bitwise AND, used for sequencing multiple regex components)
|
||||
5. bitor (bitwise OR, used for folding multiple possible execution variants'
|
||||
results into a single result)
|
||||
6. bitxor (bitwise XOR, used for the not logic in ranges)
|
||||
|
||||
### Optimizations
|
||||
|
||||
Generally the included example PME follows above approach. However, there were
|
||||
two additional optimizations applied. Both of these optimizations involved
|
||||
reducing the number of unnecessary FHE operations. I think that given how
|
||||
computationally expensive these operations are, it only makes sense to optimize
|
||||
for this (and to for example ignore any suboptimal memory usage of our PME,
|
||||
etc.).
|
||||
|
||||
The first optimization involved delaying execution of FHE operations to _after_
|
||||
generation of all the possible execution paths that have to be considered. This
|
||||
optimization allows us to prune execution paths during execution path
|
||||
construction that are provably going to result in an encrypted false value,
|
||||
without having already performed the FHE operations up to the point of pruning.
|
||||
Consider for example the regex `/^a+b$/`, and we are applying this to a content
|
||||
of size 4. If we're executing execution paths naively, we would go ahead and
|
||||
check for all possible amount of `a` repetitions: `ab`, `aab`, `aaab`.
|
||||
However, while building the execution paths, we can use the fact that `a+` must
|
||||
begin at the beginning of the content, and that `b` must be the final character
|
||||
of the content. From this follows that we only have to check for the following
|
||||
sentence: `aaab`. Delaying execution of the FHE operations til after we've
|
||||
built the possible execution paths in this example reduced the number of FHE
|
||||
operations applied by half approximately!
|
||||
|
||||
The second optimization involved preventing the same FHE conditions to be
|
||||
re-evaluated. Consider the regex `/^a?ab/`, this would give us the following
|
||||
possible execution paths that must be considered:
|
||||
1. `content[0] == a && content[1] == a && content[2] == b` (we match the `a` in
|
||||
`a?`)
|
||||
2. `content[0] == a && content[1] == b` (we don't match the `a` in `a?`)
|
||||
|
||||
Notice that for both execution paths we are checking for `content[0] == a`.
|
||||
Even though we cannot see what the encrypted result is, we do know that it's
|
||||
either going to be an encrypted false for both cases or an encrypted true for
|
||||
both cases. Therefore, we can skip the re-evaluation of `content[0] == a` and
|
||||
simply copy the result from the first evaluation over. This optimization
|
||||
involved maintaining a cache of known expression evaluations' results, and
|
||||
reusing those where possible.
|
||||
|
||||
# Trying out the example implementation
|
||||
|
||||
The implementation that guided the writing of this tutorial can be found
|
||||
under `tfhe/examples/regex_engine`.
|
||||
|
||||
When compiling with `--example regex_engine`, a binary is produced that serves
|
||||
as a basic demo. Simply call it with first argument the content string and
|
||||
second argument the pattern string. For example,
|
||||
`cargo run --release --features=x86_64-unix,integer --example regex_engine -- 'this is the content' '/^pattern$/'`;
|
||||
note it's advisable to compile the executable with the `--release` flag as the key
|
||||
generation and homomorphic operations otherwise seem to experience a heavy
|
||||
performance penalty.
|
||||
|
||||
On execution it first creates a private and public key pair. It then encrypts
|
||||
the content with the client key, and applies the regex pattern onto the
|
||||
encrypted content string - with only access to the server key. Finally, it
|
||||
decrypts the resulting encrypted result using the client key and prints the
|
||||
verdict to the console.
|
||||
|
||||
To get some more information on what exactly it is doing, set the `RUST_LOG`
|
||||
environment variable to `debug` or to `trace`.
|
||||
21
tfhe/examples/regex_engine/ciphertext.rs
Normal file
21
tfhe/examples/regex_engine/ciphertext.rs
Normal file
@@ -0,0 +1,21 @@
|
||||
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2;
|
||||
use tfhe::integer::gen_keys_radix;
|
||||
use tfhe::integer::{RadixCiphertextBig, RadixClientKey, ServerKey};
|
||||
use anyhow::{Result, anyhow};
|
||||
|
||||
pub type StringCiphertext = Vec<RadixCiphertextBig>;
|
||||
|
||||
pub fn encrypt_str(client_key: &RadixClientKey, s: &str) -> Result<StringCiphertext> {
|
||||
if !s.is_ascii() {
|
||||
return Err(anyhow!("content contains non-ascii characters"));
|
||||
}
|
||||
Ok(s.as_bytes()
|
||||
.iter()
|
||||
.map(|byte| client_key.encrypt(*byte as u64))
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub fn gen_keys() -> (RadixClientKey, ServerKey) {
|
||||
let num_block = 4;
|
||||
gen_keys_radix(&PARAM_MESSAGE_2_CARRY_2, num_block)
|
||||
}
|
||||
266
tfhe/examples/regex_engine/engine.rs
Normal file
266
tfhe/examples/regex_engine/engine.rs
Normal file
@@ -0,0 +1,266 @@
|
||||
use anyhow::Result;
|
||||
use std::rc::Rc;
|
||||
use tfhe::integer::{RadixCiphertextBig, ServerKey};
|
||||
use crate::parser::{parse, RegExpr};
|
||||
use crate::execution::{Executed, Execution, LazyExecution};
|
||||
|
||||
pub fn has_match(
|
||||
sk: &ServerKey,
|
||||
content: &[RadixCiphertextBig],
|
||||
pattern: &str,
|
||||
) -> Result<RadixCiphertextBig> {
|
||||
let re = parse(pattern)?;
|
||||
|
||||
let branches: Vec<LazyExecution> = (0..content.len())
|
||||
.flat_map(|i| build_branches(content, &re, i))
|
||||
.map(|(lazy_branch_res, _)| lazy_branch_res)
|
||||
.collect();
|
||||
|
||||
let mut exec = Execution::new(sk.clone());
|
||||
|
||||
let res = if branches.len() <= 1 {
|
||||
branches
|
||||
.get(0)
|
||||
.map_or(exec.ct_false(), |branch| branch(&mut exec))
|
||||
.0
|
||||
} else {
|
||||
branches[1..]
|
||||
.into_iter()
|
||||
.fold(branches[0](&mut exec), |res, branch| {
|
||||
let branch_res = branch(&mut exec);
|
||||
exec.ct_or(res, branch_res)
|
||||
})
|
||||
.0
|
||||
};
|
||||
info!(
|
||||
"{} ciphertext operations, {} cache hits",
|
||||
exec.ct_operations_count(),
|
||||
exec.cache_hits(),
|
||||
);
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Expands `re` at content position `c_pos` into every possible lazy match
/// branch. Each returned pair is (deferred circuit for this alternative,
/// content position right after the characters it consumes). Branches that
/// are provably false (e.g. an unmet anchor) are pruned here, before any
/// homomorphic operation is executed.
fn build_branches(
    content: &[RadixCiphertextBig],
    re: &RegExpr,
    c_pos: usize,
) -> Vec<(LazyExecution, usize)> {
    trace!("program pointer: regex={:?}, content pos={}", re, c_pos);
    // Anchors are resolved in the clear: they depend only on the (public)
    // position, never on encrypted data.
    match re {
        RegExpr::SOF => {
            if c_pos == 0 {
                return vec![(Rc::new(|exec| exec.ct_true()), c_pos)];
            } else {
                return vec![];
            }
        }
        RegExpr::EOF => {
            if c_pos == content.len() {
                return vec![(Rc::new(|exec| exec.ct_true()), c_pos)];
            } else {
                return vec![];
            }
        }
        _ => (),
    };

    // Every remaining variant inspects the current character; past the end
    // of the content there is nothing left to match.
    if c_pos >= content.len() {
        return vec![];
    }

    match re.clone() {
        // Literal: encrypted equality against a trivially encrypted constant.
        RegExpr::Char { c } => {
            let c_char = (content[c_pos].clone(), Executed::ct_pos(c_pos));
            vec![(
                Rc::new(move |exec| exec.ct_eq(c_char.clone(), exec.ct_constant(c))),
                c_pos + 1,
            )]
        }
        // `.` always matches; consumes one character with no ciphertext op.
        RegExpr::AnyChar => vec![(Rc::new(|exec| exec.ct_true()), c_pos + 1)],
        // Negation: build the inner branches, then invert each result.
        RegExpr::Not { not_re } => build_branches(content, &not_re, c_pos)
            .into_iter()
            .map(|(branch, c_pos)| {
                (
                    Rc::new(move |exec: &mut Execution| {
                        let branch_res = branch(exec);
                        exec.ct_not(branch_res)
                    }) as LazyExecution,
                    c_pos,
                )
            })
            .collect(),
        // Alternation: the union of both sides' branches.
        RegExpr::Either { l_re, r_re } => {
            let mut res = build_branches(content, &l_re, c_pos);
            res.append(&mut build_branches(content, &r_re, c_pos));
            res
        }
        // `[a-d]`: from <= c && c <= to, evaluated homomorphically.
        RegExpr::Between { from, to } => {
            let c_char = (content[c_pos].clone(), Executed::ct_pos(c_pos));
            vec![(
                Rc::new(move |exec| {
                    let ct_from = exec.ct_constant(from);
                    let ct_to = exec.ct_constant(to);
                    let ge_from = exec.ct_ge(c_char.clone(), ct_from);
                    let le_to = exec.ct_le(c_char.clone(), ct_to);
                    exec.ct_and(ge_from, le_to)
                }),
                c_pos + 1,
            )]
        }
        // `[abc]`: OR of equality tests against each member of the set.
        RegExpr::Range { cs } => {
            let c_char = (content[c_pos].clone(), Executed::ct_pos(c_pos));
            vec![(
                Rc::new(move |exec| {
                    cs[1..].iter().fold(
                        exec.ct_eq(c_char.clone(), exec.ct_constant(cs[0])),
                        |res, c| {
                            let ct_c_char_eq = exec.ct_eq(c_char.clone(), exec.ct_constant(*c));
                            exec.ct_or(res, ct_c_char_eq)
                        },
                    )
                }),
                c_pos + 1,
            )]
        }
        // Repetition: expand the mandatory prefix (`at_least` copies), then
        // iteratively extend by one copy at a time up to `at_most`.
        RegExpr::Repeated {
            repeat_re,
            at_least,
            at_most,
        } => {
            let at_least = at_least.unwrap_or(0);
            // Unbounded repetition cannot consume more than the remaining
            // content, so cap it at the leftover length.
            let at_most = at_most.unwrap_or(content.len() - c_pos);

            if at_least > at_most {
                return vec![];
            }

            let mut res = vec![
                // Zero repetitions trivially match when allowed.
                if at_least == 0 {
                    vec![(
                        Rc::new(|exec: &mut Execution| exec.ct_true()) as LazyExecution,
                        c_pos,
                    )]
                } else {
                    vec![]
                },
                // Mandatory part: a Seq of max(1, at_least) copies.
                build_branches(
                    content,
                    &(RegExpr::Seq {
                        re_xs: std::iter::repeat(*repeat_re.clone())
                            .take(std::cmp::max(1, at_least))
                            .collect(),
                    }),
                    c_pos,
                ),
            ];

            // Each step appends one more copy to every branch produced by
            // the previous step, AND-ing the old and new results.
            for _ in (at_least + 1)..(at_most + 1) {
                res.push(
                    res.last()
                        .unwrap()
                        .iter()
                        .flat_map(|(branch_prev, branch_c_pos)| {
                            build_branches(content, &repeat_re, *branch_c_pos)
                                .into_iter()
                                .map(move |(branch_x, branch_x_c_pos)| {
                                    let branch_prev = branch_prev.clone();
                                    (
                                        Rc::new(move |exec: &mut Execution| {
                                            let res_prev = branch_prev(exec);
                                            let res_x = branch_x(exec);
                                            exec.ct_and(res_prev, res_x)
                                        }) as LazyExecution,
                                        branch_x_c_pos,
                                    )
                                })
                        })
                        .collect(),
                );
            }
            res.into_iter().flatten().collect()
        }
        // `a?`: either the inner expression matches, or it is skipped.
        RegExpr::Optional { opt_re } => {
            let mut res = build_branches(content, &opt_re, c_pos);
            res.push((Rc::new(|exec| exec.ct_true()), c_pos));
            res
        }
        // Concatenation: thread each element's branches through the content
        // positions reached by the previous elements, AND-ing the results.
        RegExpr::Seq { re_xs } => re_xs[1..].iter().fold(
            build_branches(content, &re_xs[0], c_pos),
            |continuations, re_x| {
                continuations
                    .into_iter()
                    .flat_map(|(branch_prev, branch_prev_c_pos)| {
                        build_branches(content, re_x, branch_prev_c_pos)
                            .into_iter()
                            .map(move |(branch_x, branch_x_c_pos)| {
                                let branch_prev = branch_prev.clone();
                                (
                                    Rc::new(move |exec: &mut Execution| {
                                        let res_prev = branch_prev(exec);
                                        let res_x = branch_x(exec);
                                        exec.ct_and(res_prev, res_x)
                                    }) as LazyExecution,
                                    branch_x_c_pos,
                                )
                            })
                    })
                    .collect()
            },
        ),
        // SOF/EOF were handled above; no other variants exist.
        _ => panic!("unmatched regex variant"),
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::engine::has_match;
|
||||
use test_case::test_case;
|
||||
|
||||
use tfhe::integer::{ServerKey, RadixClientKey};
|
||||
use crate::ciphertext::{encrypt_str, gen_keys, StringCiphertext};
|
||||
use bincode;
|
||||
use lazy_static::lazy_static;
|
||||
use std::io::Write;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref KEYS: (RadixClientKey, ServerKey) = gen_keys();
|
||||
}
|
||||
|
||||
#[test_case("ab", "/ab/", 1)]
|
||||
#[test_case("b", "/ab/", 0)]
|
||||
#[test_case("ab", "/a?b/", 1)]
|
||||
#[test_case("b", "/a?b/", 1)]
|
||||
#[test_case("ab", "/^ab|cd$/", 1)]
|
||||
#[test_case(" ab", "/^ab|cd$/", 0)]
|
||||
#[test_case(" cd", "/^ab|cd$/", 0)]
|
||||
#[test_case("cd", "/^ab|cd$/", 1)]
|
||||
#[test_case("abcd", "/^ab|cd$/", 0)]
|
||||
#[test_case("abcd", "/ab|cd$/", 1)]
|
||||
#[test_case("abc", "/abc/", 1)]
|
||||
#[test_case("123abc", "/abc/", 1)]
|
||||
#[test_case("123abc456", "/abc/", 1)]
|
||||
#[test_case("123abdc456", "/abc/", 0)]
|
||||
#[test_case("abc456", "/abc/", 1)]
|
||||
#[test_case("bc", "/a*bc/", 1)]
|
||||
#[test_case("cdaabc", "/a*bc/", 1)]
|
||||
#[test_case("cdbc", "/a+bc/", 0)]
|
||||
#[test_case("bc", "/a+bc/", 0)]
|
||||
#[test_case("Ab", "/ab/i", 1 ; "ab case insensitive")]
|
||||
#[test_case("Ab", "/ab/", 0 ; "ab case sensitive")]
|
||||
#[test_case("cD", "/ab|cd/i", 1)]
|
||||
#[test_case("cD", "/cD/", 1)]
|
||||
#[test_case("test a num 8", "/8/", 1)]
|
||||
#[test_case("test a num 8", "/^8/", 0)]
|
||||
#[test_case("4453", "/^[0-9]*$/", 1)]
|
||||
#[test_case("4453", "/^[09]*$/", 0)]
|
||||
#[test_case("09009", "/^[09]*$/", 1)]
|
||||
#[test_case("de", "/^ab|cd|de$/", 1 ; "multiple or")]
|
||||
#[test_case(" de", "/^ab|cd|de$/", 0 ; "multiple or nests below ^")]
|
||||
fn test_has_match(content: &str, pattern: &str, exp: u64) {
|
||||
let ct_content: StringCiphertext = encrypt_str(&KEYS.0, content).unwrap();
|
||||
let ct_res = has_match(&KEYS.1, &ct_content, pattern).unwrap();
|
||||
|
||||
let got = KEYS.0.decrypt(&ct_res);
|
||||
assert_eq!(exp, got);
|
||||
}
|
||||
}
|
||||
275
tfhe/examples/regex_engine/execution.rs
Normal file
275
tfhe/examples/regex_engine/execution.rs
Normal file
@@ -0,0 +1,275 @@
|
||||
use std::collections::HashMap;
|
||||
use std::rc::Rc;
|
||||
use tfhe::integer::{RadixCiphertextBig, ServerKey};
|
||||
|
||||
use crate::parser::u8_to_char;
|
||||
|
||||
/// Symbolic (plaintext) description of how a ciphertext was produced.
/// Used as a cache key so structurally identical sub-circuits are only
/// evaluated once.
#[derive(Clone, PartialEq, Eq, Hash)]
pub(crate) enum Executed {
    // A trivially encrypted constant (pattern character, true/false).
    Constant { c: u8 },
    // The encrypted content character at position `at`.
    CtPos { at: usize },
    And { a: Box<Executed>, b: Box<Executed> },
    Or { a: Box<Executed>, b: Box<Executed> },
    Equal { a: Box<Executed>, b: Box<Executed> },
    GreaterOrEqual { a: Box<Executed>, b: Box<Executed> },
    LessOrEqual { a: Box<Executed>, b: Box<Executed> },
    Not { a: Box<Executed> },
}
// A ciphertext paired with the symbolic description of its computation.
type ExecutedResult = (RadixCiphertextBig, Executed);

impl Executed {
    /// Descriptor for the encrypted content character at position `at`.
    pub(crate) fn ct_pos(at: usize) -> Self {
        Executed::CtPos { at }
    }

    /// Returns the plaintext value when this is a trivial constant,
    /// otherwise `None`. Enables plaintext short-circuits in and/or.
    fn get_trivial_constant(&self) -> Option<u8> {
        match self {
            Self::Constant { c } => Some(*c),
            _ => None,
        }
    }
}
|
||||
|
||||
// Plaintext encodings of the boolean values carried in radix ciphertexts.
const CT_FALSE: u8 = 0;
const CT_TRUE: u8 = 1;

/// Evaluation context: owns the server key, memoizes the results of already
/// executed sub-circuits and tracks statistics for logging.
pub(crate) struct Execution {
    sk: ServerKey,
    // Maps a symbolic operation description to its computed ciphertext so
    // identical sub-circuits are only evaluated once.
    cache: HashMap<Executed, RadixCiphertextBig>,

    // Number of homomorphic operations actually performed (cache misses).
    ct_ops: usize,
    // Number of operations answered from `cache` instead.
    cache_hits: usize,
}
// A deferred computation: no homomorphic work happens until it is invoked.
pub(crate) type LazyExecution = Rc<dyn Fn(&mut Execution) -> ExecutedResult>;
|
||||
|
||||
impl Execution {
|
||||
pub(crate) fn new(sk: ServerKey) -> Self {
|
||||
Self {
|
||||
sk,
|
||||
cache: HashMap::new(),
|
||||
ct_ops: 0,
|
||||
cache_hits: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn ct_operations_count(&self) -> usize {
|
||||
self.ct_ops
|
||||
}
|
||||
|
||||
pub(crate) fn cache_hits(&self) -> usize {
|
||||
self.cache_hits
|
||||
}
|
||||
|
||||
pub(crate) fn ct_eq(&mut self, a: ExecutedResult, b: ExecutedResult) -> ExecutedResult {
|
||||
let ctx = Executed::Equal {
|
||||
a: Box::new(a.1.clone()),
|
||||
b: Box::new(b.1.clone()),
|
||||
};
|
||||
self.with_cache(
|
||||
ctx.clone(),
|
||||
Rc::new(move |exec: &mut Execution| {
|
||||
exec.ct_ops += 1;
|
||||
|
||||
let mut ct_a = a.0.clone();
|
||||
let mut ct_b = b.0.clone();
|
||||
(exec.sk.smart_eq(&mut ct_a, &mut ct_b), ctx.clone())
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn ct_ge(&mut self, a: ExecutedResult, b: ExecutedResult) -> ExecutedResult {
|
||||
let ctx = Executed::GreaterOrEqual {
|
||||
a: Box::new(a.1.clone()),
|
||||
b: Box::new(b.1.clone()),
|
||||
};
|
||||
self.with_cache(
|
||||
ctx.clone(),
|
||||
Rc::new(move |exec| {
|
||||
exec.ct_ops += 1;
|
||||
|
||||
let mut ct_a = a.0.clone();
|
||||
let mut ct_b = b.0.clone();
|
||||
(exec.sk.smart_gt(&mut ct_a, &mut ct_b), ctx.clone())
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn ct_le(&mut self, a: ExecutedResult, b: ExecutedResult) -> ExecutedResult {
|
||||
let ctx = Executed::LessOrEqual {
|
||||
a: Box::new(a.1.clone()),
|
||||
b: Box::new(b.1.clone()),
|
||||
};
|
||||
self.with_cache(
|
||||
ctx.clone(),
|
||||
Rc::new(move |exec| {
|
||||
exec.ct_ops += 1;
|
||||
|
||||
let mut ct_a = a.0.clone();
|
||||
let mut ct_b = b.0.clone();
|
||||
(exec.sk.smart_le(&mut ct_a, &mut ct_b), ctx.clone())
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn ct_and(&mut self, a: ExecutedResult, b: ExecutedResult) -> ExecutedResult {
|
||||
let ctx = Executed::And {
|
||||
a: Box::new(a.1.clone()),
|
||||
b: Box::new(b.1.clone()),
|
||||
};
|
||||
|
||||
let c_a = a.1.get_trivial_constant();
|
||||
let c_b = b.1.get_trivial_constant();
|
||||
if c_a == Some(CT_TRUE) {
|
||||
return (b.0, ctx);
|
||||
}
|
||||
if c_a == Some(CT_FALSE) {
|
||||
return (a.0, ctx);
|
||||
}
|
||||
if c_b == Some(CT_TRUE) {
|
||||
return (a.0, ctx);
|
||||
}
|
||||
if c_b == Some(CT_FALSE) {
|
||||
return (b.0, ctx);
|
||||
}
|
||||
|
||||
self.with_cache(
|
||||
ctx.clone(),
|
||||
Rc::new(move |exec| {
|
||||
exec.ct_ops += 1;
|
||||
|
||||
let mut ct_a = a.0.clone();
|
||||
let mut ct_b = b.0.clone();
|
||||
(exec.sk.smart_bitand(&mut ct_a, &mut ct_b), ctx.clone())
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn ct_or(&mut self, a: ExecutedResult, b: ExecutedResult) -> ExecutedResult {
|
||||
let ctx = Executed::Or {
|
||||
a: Box::new(a.1.clone()),
|
||||
b: Box::new(b.1.clone()),
|
||||
};
|
||||
|
||||
let c_a = a.1.get_trivial_constant();
|
||||
let c_b = b.1.get_trivial_constant();
|
||||
if c_a == Some(CT_TRUE) {
|
||||
return (a.0, ctx);
|
||||
}
|
||||
if c_b == Some(CT_TRUE) {
|
||||
return (b.0, ctx);
|
||||
}
|
||||
if c_a == Some(CT_FALSE) && c_b == Some(CT_FALSE) {
|
||||
return (a.0, ctx);
|
||||
}
|
||||
|
||||
self.with_cache(
|
||||
ctx.clone(),
|
||||
Rc::new(move |exec| {
|
||||
exec.ct_ops += 1;
|
||||
|
||||
let mut ct_a = a.0.clone();
|
||||
let mut ct_b = b.0.clone();
|
||||
(exec.sk.smart_bitor(&mut ct_a, &mut ct_b), ctx.clone())
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn ct_not(&mut self, a: ExecutedResult) -> ExecutedResult {
|
||||
let ctx = Executed::Not {
|
||||
a: Box::new(a.1.clone()),
|
||||
};
|
||||
self.with_cache(
|
||||
ctx.clone(),
|
||||
Rc::new(move |exec| {
|
||||
exec.ct_ops += 1;
|
||||
|
||||
let mut ct_a = a.0.clone();
|
||||
let mut ct_b = exec.ct_constant(1).0;
|
||||
(
|
||||
exec.sk.smart_bitxor(&mut ct_a, &mut ct_b),
|
||||
ctx.clone(),
|
||||
)
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn ct_false(&self) -> ExecutedResult {
|
||||
self.ct_constant(CT_FALSE)
|
||||
}
|
||||
|
||||
pub(crate) fn ct_true(&self) -> ExecutedResult {
|
||||
self.ct_constant(CT_TRUE)
|
||||
}
|
||||
|
||||
pub(crate) fn ct_constant(&self, c: u8) -> ExecutedResult {
|
||||
(
|
||||
self.sk.create_trivial_radix(c as u64, 4),
|
||||
Executed::Constant { c },
|
||||
)
|
||||
}
|
||||
|
||||
fn with_cache(&mut self, ctx: Executed, f: LazyExecution) -> ExecutedResult {
|
||||
if let Some(res) = self.cache.get(&ctx) {
|
||||
trace!("cache hit: {:?}", &ctx);
|
||||
self.cache_hits += 1;
|
||||
return (res.clone(), ctx);
|
||||
}
|
||||
debug!("evaluation for: {:?}", &ctx);
|
||||
let res = f(self);
|
||||
self.cache.insert(ctx, res.0.clone());
|
||||
res
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Executed {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Constant { c } => match c {
|
||||
0 => write!(f, "f"),
|
||||
1 => write!(f, "t"),
|
||||
_ => write!(f, "{}", u8_to_char(*c)),
|
||||
},
|
||||
Self::CtPos { at } => write!(f, "ct_{}", at),
|
||||
Self::And { a, b } => {
|
||||
write!(f, "(")?;
|
||||
a.fmt(f)?;
|
||||
write!(f, "/\\")?;
|
||||
b.fmt(f)?;
|
||||
write!(f, ")")
|
||||
}
|
||||
Self::Or { a, b } => {
|
||||
write!(f, "(")?;
|
||||
a.fmt(f)?;
|
||||
write!(f, "\\/")?;
|
||||
b.fmt(f)?;
|
||||
write!(f, ")")
|
||||
}
|
||||
Self::Equal { a, b } => {
|
||||
write!(f, "(")?;
|
||||
a.fmt(f)?;
|
||||
write!(f, "==")?;
|
||||
b.fmt(f)?;
|
||||
write!(f, ")")
|
||||
}
|
||||
Self::GreaterOrEqual { a, b } => {
|
||||
write!(f, "(")?;
|
||||
a.fmt(f)?;
|
||||
write!(f, ">=")?;
|
||||
b.fmt(f)?;
|
||||
write!(f, ")")
|
||||
}
|
||||
Self::LessOrEqual { a, b } => {
|
||||
write!(f, "(")?;
|
||||
a.fmt(f)?;
|
||||
write!(f, "<=")?;
|
||||
b.fmt(f)?;
|
||||
write!(f, ")")
|
||||
}
|
||||
Self::Not { a } => {
|
||||
write!(f, "(!")?;
|
||||
a.fmt(f)?;
|
||||
write!(f, ")")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
30
tfhe/examples/regex_engine/main.rs
Normal file
30
tfhe/examples/regex_engine/main.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
mod engine;
|
||||
mod ciphertext;
|
||||
mod execution;
|
||||
mod parser;
|
||||
|
||||
use std::env;
|
||||
use env_logger::Env;
|
||||
|
||||
fn main() {
|
||||
let env = Env::default().filter_or("RUST_LOG", "info");
|
||||
env_logger::init_from_env(env);
|
||||
|
||||
let args: Vec<String> = env::args().collect();
|
||||
let content = &args[1];
|
||||
let pattern = &args[2];
|
||||
|
||||
let (client_key, server_key) = ciphertext::gen_keys();
|
||||
let ct_content = ciphertext::encrypt_str(&client_key, content).unwrap();
|
||||
|
||||
let ct_res = engine::has_match(&server_key, &ct_content, pattern).unwrap();
|
||||
let res: u64 = client_key.decrypt(&ct_res);
|
||||
if res == 0 {
|
||||
println!("no match");
|
||||
} else {
|
||||
println!("match");
|
||||
}
|
||||
}
|
||||
694
tfhe/examples/regex_engine/parser.rs
Normal file
694
tfhe/examples/regex_engine/parser.rs
Normal file
@@ -0,0 +1,694 @@
|
||||
use anyhow::{anyhow, Result};
|
||||
use combine::parser::byte;
|
||||
use combine::parser::byte::byte;
|
||||
use combine::*;
|
||||
|
||||
use std::fmt;
|
||||
|
||||
/// Abstract syntax tree of a parsed regular expression pattern.
#[derive(Clone, PartialEq, Eq, Hash)]
pub(crate) enum RegExpr {
    // `^`: anchor at the start of the content.
    SOF,
    // `$`: anchor at the end of the content.
    EOF,
    // A single literal character (possibly produced by an escape).
    Char {
        c: u8,
    },
    // `.`: matches any single character.
    AnyChar,
    // `[a-d]`: one character in the inclusive span `from..=to`.
    Between {
        from: u8,
        to: u8,
    },
    // `[abc]`: one character out of an explicit set.
    Range {
        cs: Vec<u8>,
    },
    // `[^...]`: negation of the inner expression.
    Not {
        not_re: Box<RegExpr>,
    },
    // `a|b`: alternation between two sub-expressions.
    Either {
        l_re: Box<RegExpr>,
        r_re: Box<RegExpr>,
    },
    // `a?`: zero or one occurrence of the inner expression.
    Optional {
        opt_re: Box<RegExpr>,
    },
    // `a*`, `a+`, `a{n}`, `a{n,m}`: bounded or unbounded repetition.
    Repeated {
        repeat_re: Box<RegExpr>,
        at_least: Option<usize>, // if None: no least limit, aka 0 times
        at_most: Option<usize>,  // if None: no most limit
    },
    // Concatenation of sub-expressions, matched in order.
    Seq {
        re_xs: Vec<RegExpr>,
    },
}
|
||||
|
||||
impl RegExpr {
    /// Recursively rewrites the expression so every literal letter matches
    /// both its lower- and upper-case form (implements the `/.../i` flag).
    fn case_insensitive(self) -> Self {
        match self {
            // A literal letter becomes the two-character set {lower, upper}.
            Self::Char { c } => Self::Range {
                cs: case_insensitive(c),
            },
            Self::Not { not_re } => Self::Not {
                not_re: Box::new(not_re.case_insensitive()),
            },
            Self::Either { l_re, r_re } => Self::Either {
                l_re: Box::new(l_re.case_insensitive()),
                r_re: Box::new(r_re.case_insensitive()),
            },
            Self::Optional { opt_re } => Self::Optional {
                opt_re: Box::new(opt_re.case_insensitive()),
            },
            Self::Repeated { repeat_re, at_least, at_most } => Self::Repeated {
                repeat_re: Box::new(repeat_re.case_insensitive()),
                at_least,
                at_most,
            },
            Self::Seq { re_xs } => Self::Seq {
                re_xs: re_xs.into_iter().map(|re| re.case_insensitive()).collect(),
            },
            // Anchors, `.`, `Between` and `Range` are left untouched.
            // NOTE(review): `Between` spans (e.g. [a-d]) and explicit sets
            // are NOT case-folded here — presumably intentional; confirm.
            _ => self,
        }
    }
}
|
||||
|
||||
fn case_insensitive(x: u8) -> Vec<u8> {
|
||||
let c = u8_to_char(x);
|
||||
if c.is_ascii_lowercase() {
|
||||
return vec![x, c.to_ascii_uppercase() as u8];
|
||||
}
|
||||
if c.is_ascii_uppercase() {
|
||||
return vec![x, c.to_ascii_lowercase() as u8];
|
||||
}
|
||||
vec![x]
|
||||
}
|
||||
|
||||
/// Converts a byte to the corresponding `char` (Latin-1 mapping).
/// `char::from(u8)` is infallible, removing the `unwrap` panic path of
/// the previous `char::from_u32(..).unwrap()` formulation.
pub(crate) fn u8_to_char(c: u8) -> char {
    char::from(c)
}
|
||||
|
||||
impl fmt::Debug for RegExpr {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Self::SOF => write!(f, "^"),
|
||||
Self::EOF => write!(f, "$"),
|
||||
Self::Char { c } => write!(f, "{}", u8_to_char(*c)),
|
||||
Self::AnyChar => write!(f, "."),
|
||||
Self::Not { not_re } => {
|
||||
write!(f, "[^")?;
|
||||
not_re.fmt(f)?;
|
||||
write!(f, "]")
|
||||
}
|
||||
Self::Between { from, to } => {
|
||||
write!(f, "[{}->{}]", u8_to_char(*from), u8_to_char(*to),)
|
||||
}
|
||||
Self::Range { cs } => write!(
|
||||
f,
|
||||
"[{}]",
|
||||
cs.iter().map(|c| u8_to_char(*c)).collect::<String>(),
|
||||
),
|
||||
Self::Either { l_re, r_re } => {
|
||||
write!(f, "(")?;
|
||||
l_re.fmt(f)?;
|
||||
write!(f, "|")?;
|
||||
r_re.fmt(f)?;
|
||||
write!(f, ")")
|
||||
}
|
||||
Self::Repeated {
|
||||
repeat_re,
|
||||
at_least,
|
||||
at_most,
|
||||
} => {
|
||||
let stringify_opt_n = |opt_n: &Option<usize>| -> String {
|
||||
opt_n.map_or("*".to_string(), |n| format!("{:?}", n))
|
||||
};
|
||||
repeat_re.fmt(f)?;
|
||||
write!(
|
||||
f,
|
||||
"{{{},{}}}",
|
||||
stringify_opt_n(at_least),
|
||||
stringify_opt_n(at_most)
|
||||
)
|
||||
}
|
||||
Self::Optional { opt_re } => {
|
||||
opt_re.fmt(f)?;
|
||||
write!(f, "?")
|
||||
}
|
||||
Self::Seq { re_xs } => {
|
||||
write!(f, "<")?;
|
||||
for re_x in re_xs {
|
||||
re_x.fmt(f)?;
|
||||
}
|
||||
write!(f, ">")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses a `/pattern/` string (optionally suffixed with `i` for
/// case-insensitive matching) into a `RegExpr` AST.
///
/// # Errors
/// Fails when the combine parser rejects the input or when trailing,
/// unconsumed tokens remain after the pattern.
pub(crate) fn parse(pattern: &str) -> Result<RegExpr> {
    let (parsed, unparsed) = ((
        between(
            byte(b'/'),
            byte(b'/'),
            // Anchors are only recognized at the very edges of the pattern,
            // outside the recursive grammar.
            (optional(byte(b'^')), regex(), optional(byte(b'$'))),
        )
        .map(|(sof, re, eof)| {
            // Only wrap in a Seq with SOF/EOF markers when at least one
            // anchor is actually present.
            if sof.is_none() && eof.is_none() {
                return re;
            }
            let mut re_xs = vec![];
            if sof.is_some() {
                re_xs.push(RegExpr::SOF);
            }
            re_xs.push(re);
            if eof.is_some() {
                re_xs.push(RegExpr::EOF);
            }
            RegExpr::Seq { re_xs }
        }),
        // Optional trailing `i` flag after the closing '/'.
        optional(byte(b'i')),
    ))
    .map(|(re, case_insensitive)| {
        if case_insensitive.is_some() {
            re.case_insensitive()
        } else {
            re
        }
    })
    .parse(pattern.as_bytes())?;
    // Leftover input means the pattern was not fully consumed.
    if !unparsed.is_empty() {
        return Err(anyhow!(
            "failed to parse regular expression, unexpected token at start of: {}",
            std::str::from_utf8(unparsed).unwrap()
        ));
    }

    Ok(parsed)
}
|
||||
|
||||
// based on grammar from: https://matt.might.net/articles/parsing-regex-with-recursive-descent/
|
||||
//
|
||||
// <regex> ::= <term> '|' <regex>
|
||||
// | <term>
|
||||
//
|
||||
// <term> ::= { <factor> }
|
||||
//
|
||||
// <factor> ::= <base> { '*' }
|
||||
//
|
||||
// <base> ::= <char>
|
||||
// | '\' <char>
|
||||
// | '(' <regex> ')'
|
||||
|
||||
// combine's `parser!` macro breaks the recursion cycle: `regex_` returns an
// opaque `impl Parser` type, which cannot refer to itself directly.
parser! {
    fn regex[Input]()(Input) -> RegExpr
    where [Input: Stream<Token = u8>]
    {
        regex_()
    }
}
|
||||
|
||||
// <regex> ::= <term> '|' <regex> | <term>
fn regex_<Input>() -> impl Parser<Input, Output = RegExpr>
where
    Input: Stream<Token = u8>,
    Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
    choice((
        // attempt() backtracks so the plain <term> alternative can be
        // retried when no '|' follows.
        attempt(
            (term(), byte(b'|'), regex()).map(|(l_re, _, r_re)| RegExpr::Either {
                l_re: Box::new(l_re),
                r_re: Box::new(r_re),
            }),
        ),
        term(),
    ))
}
|
||||
|
||||
fn term<Input>() -> impl Parser<Input, Output = RegExpr>
|
||||
where
|
||||
Input: Stream<Token = u8>,
|
||||
Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
|
||||
{
|
||||
many(factor()).map(|re_xs: Vec<RegExpr>| {
|
||||
if re_xs.len() == 1 {
|
||||
re_xs[0].clone()
|
||||
} else {
|
||||
RegExpr::Seq { re_xs }
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// <factor> ::= <atom> '?' | <repeated> | <atom>
fn factor<Input>() -> impl Parser<Input, Output = RegExpr>
where
    Input: Stream<Token = u8>,
    Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
    choice((
        // `atom?`; attempt() backtracks when no '?' follows the atom.
        attempt((atom(), byte(b'?'))).map(|(re, _)| RegExpr::Optional {
            opt_re: Box::new(re),
        }),
        // `atom*`, `atom+`, `atom{...}` repetitions.
        attempt(repeated()),
        atom(),
    ))
}
|
||||
|
||||
// Symbols accepted as literal characters without a preceding backslash.
const NON_ESCAPABLE_SYMBOLS: [u8; 14] = [
    b'&', b';', b':', b',', b'`', b'~', b'-', b'_', b'!', b'@', b'#', b'%', b'\'', b'\"',
];
|
||||
|
||||
// <base> ::= '.' | '\' <any> | <literal> | '[' <range> ']' | '(' <regex> ')'
fn atom<Input>() -> impl Parser<Input, Output = RegExpr>
where
    Input: Stream<Token = u8>,
    Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
    choice((
        byte(b'.').map(|_| RegExpr::AnyChar),
        // A backslash turns the following byte into a literal, whatever it is.
        attempt(byte(b'\\').with(parser::token::any())).map(|c| RegExpr::Char { c }),
        // Alphanumerics and a fixed symbol set are literals as-is.
        choice((byte::alpha_num(), parser::token::one_of(NON_ESCAPABLE_SYMBOLS)))
            .map(|c| RegExpr::Char { c }),
        between(byte(b'['), byte(b']'), range()),
        between(byte(b'('), byte(b')'), regex()),
    ))
}
|
||||
|
||||
// As with `regex`, the `parser!` macro is needed because `range_` recurses
// (for the `[^...]` case) and an `impl Parser` type cannot name itself.
parser! {
    fn range[Input]()(Input) -> RegExpr
    where [Input: Stream<Token = u8>]
    {
        range_()
    }
}
|
||||
|
||||
// Inside `[...]`: a negation `^...`, a span `a-d`, or an explicit set `abc`.
fn range_<Input>() -> impl Parser<Input, Output = RegExpr>
where
    Input: Stream<Token = u8>,
    Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
    choice((
        byte(b'^').with(range()).map(|re| RegExpr::Not {
            not_re: Box::new(re),
        }),
        // attempt() lets `a-d` backtrack into the plain-set alternative
        // when no '-' is present.
        attempt(
            (byte::alpha_num(), byte(b'-'), byte::alpha_num())
                .map(|(from, _, to)| RegExpr::Between { from, to }),
        ),
        many1(byte::alpha_num()).map(|cs| RegExpr::Range { cs }),
    ))
}
|
||||
|
||||
// Repetition suffixes: `*`, `+`, `{n}`, and `{n,m}` (either bound optional).
fn repeated<Input>() -> impl Parser<Input, Output = RegExpr>
where
    Input: Stream<Token = u8>,
    Input::Error: ParseError<Input::Token, Input::Range, Input::Position>,
{
    choice((
        // `*` => {0,∞}; `+` => {1,∞}.
        attempt((atom(), choice((byte(b'*'), byte(b'+'))))).map(|(re, c)| RegExpr::Repeated {
            repeat_re: Box::new(re),
            at_least: if c == b'*' { None } else { Some(1) },
            at_most: None,
        }),
        // `{n}`: exactly n repetitions.
        attempt((
            atom(),
            between(byte(b'{'), byte(b'}'), many::<Vec<u8>, _, _>(byte::digit())),
        ))
        .map(|(re, repeat_digits)| {
            let repeat = parse_digits(&repeat_digits);
            RegExpr::Repeated {
                repeat_re: Box::new(re),
                at_least: Some(repeat),
                at_most: Some(repeat),
            }
        }),
        // `{n,m}`: either bound may be omitted (e.g. `{3,}`, `{,5}`).
        (
            atom(),
            between(
                byte(b'{'),
                byte(b'}'),
                (
                    many::<Vec<u8>, _, _>(byte::digit()),
                    byte(b','),
                    many::<Vec<u8>, _, _>(byte::digit()),
                ),
            ),
        )
            .map(
                |(re, (at_least_digits, _, at_most_digits))| RegExpr::Repeated {
                    repeat_re: Box::new(re),
                    // An empty digit run means the bound was omitted.
                    at_least: if at_least_digits.len() == 0 {
                        None
                    } else {
                        Some(parse_digits(&at_least_digits))
                    },
                    at_most: if at_most_digits.len() == 0 {
                        None
                    } else {
                        Some(parse_digits(&at_most_digits))
                    },
                },
            ),
    ))
}
|
||||
|
||||
/// Converts a run of ASCII digit bytes (as produced by the `byte::digit()`
/// parser) into a usize repetition bound.
///
/// # Panics
/// Panics when the count does not fit in `usize` (absurdly large patterns);
/// the inputs are guaranteed ASCII digits by the parser.
fn parse_digits(digits: &[u8]) -> usize {
    std::str::from_utf8(digits)
        .expect("digit bytes are ASCII by construction")
        .parse()
        .expect("repetition count does not fit in usize")
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::parser::{parse, RegExpr};
|
||||
use test_case::test_case;
|
||||
|
||||
#[test_case("/h/", RegExpr::Char { c: b'h' }; "char")]
|
||||
#[test_case("/&/", RegExpr::Char { c: b'&' }; "not necessary to escape ampersand")]
|
||||
#[test_case("/;/", RegExpr::Char { c: b';' }; "not necessary to escape semicolon")]
|
||||
#[test_case("/:/", RegExpr::Char { c: b':' }; "not necessary to escape colon")]
|
||||
#[test_case("/,/", RegExpr::Char { c: b',' }; "not necessary to escape comma")]
|
||||
#[test_case("/`/", RegExpr::Char { c: b'`' }; "not necessary to escape backtick")]
|
||||
#[test_case("/~/", RegExpr::Char { c: b'~' }; "not necessary to escape tilde")]
|
||||
#[test_case("/-/", RegExpr::Char { c: b'-' }; "not necessary to escape minus")]
|
||||
#[test_case("/_/", RegExpr::Char { c: b'_' }; "not necessary to escape underscore")]
|
||||
#[test_case("/%/", RegExpr::Char { c: b'%' }; "not necessary to escape percentage")]
|
||||
#[test_case("/#/", RegExpr::Char { c: b'#' }; "not necessary to escape hashtag")]
|
||||
#[test_case("/@/", RegExpr::Char { c: b'@' }; "not necessary to escape at")]
|
||||
#[test_case("/!/", RegExpr::Char { c: b'!' }; "not necessary to escape exclamation")]
|
||||
#[test_case("/'/", RegExpr::Char { c: b'\'' }; "not necessary to escape single quote")]
|
||||
#[test_case("/\"/", RegExpr::Char { c: b'\"' }; "not necessary to escape double quote")]
|
||||
#[test_case("/\\h/", RegExpr::Char { c: b'h' }; "anything can be escaped")]
|
||||
#[test_case("/./", RegExpr::AnyChar; "any")]
|
||||
#[test_case("/abc/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Char { c: b'b' },
|
||||
RegExpr::Char { c: b'c' },
|
||||
]};
|
||||
"abc")]
|
||||
#[test_case("/^abc/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Char { c: b'b' },
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
]};
|
||||
"<sof>abc")]
|
||||
#[test_case("/abc$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Char { c: b'b' },
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"abc<eof>")]
|
||||
#[test_case("/^abc$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Char { c: b'b' },
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof>abc<eof>")]
|
||||
#[test_case("/^ab?c$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Optional { opt_re: Box::new(RegExpr::Char { c: b'b' }) },
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof>ab<question>c<eof>")]
|
||||
#[test_case("/^ab*c$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
|
||||
at_least: None,
|
||||
at_most: None,
|
||||
},
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof>ab<star>c<eof>")]
|
||||
#[test_case("/^ab+c$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
|
||||
at_least: Some(1),
|
||||
at_most: None,
|
||||
},
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof>ab<plus>c<eof>")]
|
||||
#[test_case("/^ab{2}c$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
|
||||
at_least: Some(2),
|
||||
at_most: Some(2),
|
||||
},
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof>ab<twice>c<eof>")]
|
||||
#[test_case("/^ab{3,}c$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
|
||||
at_least: Some(3),
|
||||
at_most: None,
|
||||
},
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof>ab<atleast 3>c<eof>")]
|
||||
#[test_case("/^ab{2,4}c$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'b' }),
|
||||
at_least: Some(2),
|
||||
at_most: Some(4),
|
||||
},
|
||||
RegExpr::Char { c: b'c' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof>ab<between 2 and 4>c<eof>")]
|
||||
#[test_case("/^.$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::AnyChar,
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof><any><eof>")]
|
||||
#[test_case("/^[abc]$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Range { cs: vec![b'a', b'b', b'c'] },
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof><a or b or c><eof>")]
|
||||
#[test_case("/^[a-d]$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Between { from: b'a', to: b'd' },
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof><between a and d><eof>")]
|
||||
#[test_case("/^[^abc]$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Not { not_re: Box::new(RegExpr::Range { cs: vec![b'a', b'b', b'c'] })},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof><not <a or b or c>><eof>")]
|
||||
#[test_case("/^[^a-d]$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Not { not_re: Box::new(RegExpr::Between { from: b'a', to: b'd' }) },
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof><not <between a and d>><eof>")]
|
||||
#[test_case("/^abc$/i",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Range { cs: vec![b'a', b'A'] },
|
||||
RegExpr::Range { cs: vec![b'b', b'B'] },
|
||||
RegExpr::Range { cs: vec![b'c', b'C'] },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"<sof>abc<eof> (case insensitive)")]
|
||||
#[test_case("/^/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq { re_xs: vec![] }
|
||||
]};
|
||||
"sof")]
|
||||
#[test_case("/$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Seq { re_xs: vec![] },
|
||||
RegExpr::EOF
|
||||
]};
|
||||
"eof")]
|
||||
#[test_case("/a*/",
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
|
||||
at_least: None,
|
||||
at_most: None,
|
||||
};
|
||||
"repeat unbounded (w/ *)")]
|
||||
#[test_case("/a+/",
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
|
||||
at_least: Some(1),
|
||||
at_most: None,
|
||||
};
|
||||
"repeat bounded at least (w/ +)")]
|
||||
#[test_case("/a{104,}/",
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
|
||||
at_least: Some(104),
|
||||
at_most: None,
|
||||
};
|
||||
"repeat bounded at least (w/ {x,}")]
|
||||
#[test_case("/a{,15}/",
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
|
||||
at_least: None,
|
||||
at_most: Some(15),
|
||||
};
|
||||
"repeat bounded at most (w/ {,x}")]
|
||||
#[test_case("/a{12,15}/",
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Char { c: b'a' }),
|
||||
at_least: Some(12),
|
||||
at_most: Some(15),
|
||||
};
|
||||
"repeat bounded at least and at most (w/ {x,y}")]
|
||||
#[test_case("/(a|b)*/",
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Either {
|
||||
l_re: Box::new(RegExpr::Char { c: b'a' }),
|
||||
r_re: Box::new(RegExpr::Char { c: b'b' }),
|
||||
}),
|
||||
at_least: None,
|
||||
at_most: None,
|
||||
};
|
||||
"repeat complex unbounded")]
|
||||
#[test_case("/(a|b){3,7}/",
|
||||
RegExpr::Repeated {
|
||||
repeat_re: Box::new(RegExpr::Either {
|
||||
l_re: Box::new(RegExpr::Char { c: b'a' }),
|
||||
r_re: Box::new(RegExpr::Char { c: b'b' }),
|
||||
}),
|
||||
at_least: Some(3),
|
||||
at_most: Some(7),
|
||||
};
|
||||
"repeat complex bounded")]
|
||||
#[test_case("/^ab|cd/",
|
||||
RegExpr::Seq { re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Either {
|
||||
l_re: Box::new(RegExpr::Seq { re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Char { c: b'b' },
|
||||
] }),
|
||||
r_re: Box::new(RegExpr::Seq { re_xs: vec![
|
||||
RegExpr::Char { c: b'c' },
|
||||
RegExpr::Char { c: b'd' },
|
||||
]}),
|
||||
},
|
||||
]};
|
||||
"SOF encapsulates full RHS")]
|
||||
#[test_case("/ab|cd$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Either {
|
||||
l_re: Box::new(RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Char { c: b'b' },
|
||||
]}),
|
||||
r_re: Box::new(RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'c' },
|
||||
RegExpr::Char { c: b'd' },
|
||||
]}),
|
||||
},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"EOF encapsulates full RHS" )]
|
||||
#[test_case("/^ab|cd$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Either {
|
||||
l_re: Box::new(RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Char { c: b'b' },
|
||||
]}),
|
||||
r_re: Box::new(RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'c' },
|
||||
RegExpr::Char { c: b'd' },
|
||||
]}),
|
||||
},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"SOF + EOF both encapsulate full center")]
|
||||
#[test_case("/\\^/",
|
||||
RegExpr::Char { c: b'^' };
|
||||
"escaping sof symbol")]
|
||||
#[test_case("/\\./",
|
||||
RegExpr::Char { c: b'.' };
|
||||
"escaping period symbol")]
|
||||
#[test_case("/\\*/",
|
||||
RegExpr::Char { c: b'*' };
|
||||
"escaping star symbol")]
|
||||
#[test_case("/^ca\\^b$/",
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::SOF,
|
||||
RegExpr::Seq {re_xs: vec![
|
||||
RegExpr::Char { c: b'c' },
|
||||
RegExpr::Char { c: b'a' },
|
||||
RegExpr::Char { c: b'^' },
|
||||
RegExpr::Char { c: b'b' },
|
||||
]},
|
||||
RegExpr::EOF,
|
||||
]};
|
||||
"escaping, more realistic")]
|
||||
#[test_case("/8/",
|
||||
RegExpr::Char { c: b'8' };
|
||||
"able to match numbers")]
|
||||
#[test_case("/[7-9]/",
|
||||
RegExpr::Between { from: b'7', to: b'9' };
|
||||
"able to match a number range")]
|
||||
#[test_case("/[79]/",
|
||||
RegExpr::Range { cs: vec![b'7', b'9'] };
|
||||
"able to match a number range (part 2)")]
|
||||
fn test_parser(pattern: &str, exp: RegExpr) {
|
||||
match parse(pattern) {
|
||||
Ok(got) => assert_eq!(exp, got),
|
||||
Err(e) => panic!("got err: {}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -38,6 +38,9 @@ pub mod integer;
|
||||
/// cbindgen:ignore
|
||||
pub mod shortint;
|
||||
|
||||
#[cfg(feature = "regex")]
|
||||
pub mod regex;
|
||||
|
||||
#[cfg(feature = "__wasm_api")]
|
||||
/// cbindgen:ignore
|
||||
pub mod js_on_wasm_api;
|
||||
|
||||
Reference in New Issue
Block a user