libexpr: Replace regex engine with boost::regex
This avoids C++'s standard library regexes, which aren't the same across platforms, and have many other issues, like using stack so much that they stack overflow when processing a lot of data. To avoid backwards and forward compatibility issues, regexes are processed using a function converting libstdc++ regexes into Boost regexes, escaping characters that Boost needs to have escaped, and rejecting features that Boost has and libstdc++ doesn't. Related context: - Original failed attempt to use `boost::regex` in CppNix, failed due to boost icu dependency being large (disabling ICU is no longer necessary because linking ICU requires using a different header file, `boost/regex/icu.hpp`): https://github.com/NixOS/nix/pull/3826 - An attempt to use PCRE, rejected due to providing less backwards compatibility with `std::regex` than `boost::regex`: https://github.com/NixOS/nix/pull/7336 - Second attempt to use `boost::regex`, failed due to `}` regex failing to compile (dealt with by writing a wrapper that parses a regular expression and escapes `}` characters): https://github.com/NixOS/nix/pull/7762 Closes #34. Closes #476. Change-Id: Ieb0eb9e270a93e4c7eed412ba4f9f96cb00a5fa4
This commit is contained in:
parent
e727dbc3a3
commit
447212fa65
4 changed files with 384 additions and 12 deletions
|
@ -129,6 +129,10 @@ roberth:
|
|||
display_name: Robert Hensing
|
||||
github: roberth
|
||||
|
||||
sugar:
|
||||
forgejo: sugar
|
||||
github: sugar700
|
||||
|
||||
thufschmitt:
|
||||
display_name: Théophane Hufschmitt
|
||||
github: thufschmitt
|
||||
|
|
37
doc/manual/rl-next/boost-regex.md
Normal file
37
doc/manual/rl-next/boost-regex.md
Normal file
|
@ -0,0 +1,37 @@
|
|||
---
|
||||
synopsis: Replace regex engine with boost::regex
|
||||
issues: [fj#34, fj#476]
|
||||
cls: [1821]
|
||||
category: Fixes
|
||||
credits: [sugar]
|
||||
---
|
||||
|
||||
Previously, the C++ standard regex expression library was used, the
|
||||
behaviour of which varied depending on the platform. This has been
|
||||
replaced with the Boost regex library, which works identically across
|
||||
platforms.
|
||||
|
||||
The visible behaviour of the regex functions doesn't change. While
|
||||
the new library has more features, Lix will reject regular expressions
|
||||
using them.
|
||||
|
||||
This also fixes regex matching reporting stack overflow when matching
|
||||
on too much data.
|
||||
|
||||
Before:
|
||||
|
||||
nix-repl> builtins.match ".*" (
|
||||
builtins.concatStringsSep "" (
|
||||
builtins.genList (_: "a") 1000000
|
||||
)
|
||||
)
|
||||
error: stack overflow (possible infinite recursion)
|
||||
|
||||
After:
|
||||
|
||||
nix-repl> builtins.match ".*" (
|
||||
builtins.concatStringsSep "" (
|
||||
builtins.genList (_: "a") 1000000
|
||||
)
|
||||
)
|
||||
[ ]
|
|
@ -17,6 +17,7 @@
|
|||
#include "fetch-to-store.hh"
|
||||
|
||||
#include <boost/container/small_vector.hpp>
|
||||
#include <boost/regex.hpp>
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <sys/types.h>
|
||||
|
@ -26,7 +27,6 @@
|
|||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
#include <dlfcn.h>
|
||||
|
||||
#include <cmath>
|
||||
|
@ -3878,19 +3878,205 @@ static RegisterPrimOp primop_hashString({
|
|||
.fun = prim_hashString,
|
||||
});
|
||||
|
||||
enum class RegexParseState {
|
||||
// Anything outside of those
|
||||
Regular,
|
||||
|
||||
// Bounded repeats, `}` shouldn't be escaped in those
|
||||
//
|
||||
// a{2,5}b
|
||||
// ^^^^
|
||||
BoundedRepeat,
|
||||
|
||||
// Backslashes, as C++ regexes only support escaping what needs to be
|
||||
// escaped and nothing else
|
||||
//
|
||||
// a\nb
|
||||
// ^
|
||||
Backslash,
|
||||
|
||||
// Initial part of character set, as `[]]` is a regex for `]` character
|
||||
//
|
||||
// [abc] [^abc]
|
||||
// ^ ^
|
||||
CharacterSetStart,
|
||||
|
||||
// Initial part of negated character set, as `[^]]` is a regex for
|
||||
// anything but `]` character
|
||||
//
|
||||
// [^abc]
|
||||
// ^
|
||||
NegatedCharacterSetStart,
|
||||
|
||||
// Character set after its first character
|
||||
//
|
||||
// [abc]
|
||||
// ^^
|
||||
CharacterSetMiddle,
|
||||
|
||||
// Parser state after seeing [, assumes the input is character extension
|
||||
// after seeing `:`, `.`, or `=`
|
||||
//
|
||||
// [a[:alpha:]b]
|
||||
// ^
|
||||
PossibleCharacterSetExtension,
|
||||
|
||||
// Within character extension
|
||||
//
|
||||
// [a[:alpha:]b]
|
||||
// ^^^^^^^
|
||||
CharacterSetExtension,
|
||||
|
||||
// Within equivalence class expression
|
||||
//
|
||||
// [[=a=]]
|
||||
// ^
|
||||
EquivalenceClassExpression,
|
||||
};
|
||||
|
||||
static boost::regex compile_regex(std::string_view re) {
|
||||
// Make sure that Boost supports everything that C++ regexes do,
|
||||
// and no non-standard extensions are available.
|
||||
//
|
||||
// In particular, C++ regexes only support escaping regex metacharacters.
|
||||
// They don't support other escape sequences like `\n` and `\d`.
|
||||
// Additionally, within character groups, it's not possible to escape
|
||||
// anything, backslash is a literal character in those. `[\]` in regexes
|
||||
// is a weird way to write `\\`.
|
||||
std::string boost_re;
|
||||
boost_re.reserve(re.size());
|
||||
auto state = RegexParseState::Regular;
|
||||
for (char c : re) {
|
||||
switch (state) {
|
||||
case RegexParseState::Regular:
|
||||
switch (c) {
|
||||
// Boost regex engine supports more escape sequences than C++ regexes,
|
||||
// and as such it's necessary to ensure only escapes supported by C++
|
||||
// are allowed.
|
||||
case '\\':
|
||||
state = RegexParseState::Backslash;
|
||||
break;
|
||||
case '[':
|
||||
state = RegexParseState::CharacterSetStart;
|
||||
break;
|
||||
case '{':
|
||||
state = RegexParseState::BoundedRepeat;
|
||||
break;
|
||||
// Boost doesn't permit unescaped `}`, escape it outside of
|
||||
// bounded repeats.
|
||||
case '}':
|
||||
boost_re.push_back('\\');
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case RegexParseState::BoundedRepeat:
|
||||
if (c == '}') {
|
||||
state = RegexParseState::Regular;
|
||||
}
|
||||
break;
|
||||
|
||||
case RegexParseState::Backslash:
|
||||
switch (c) {
|
||||
case '.': case '|': case '*': case '?': case '+': case '{':
|
||||
case '^': case '$': case '[': case '(': case ')': case '\\':
|
||||
state = RegexParseState::Regular;
|
||||
break;
|
||||
default:
|
||||
throw boost::regex_error(
|
||||
boost::regex_constants::error_type::error_escape
|
||||
);
|
||||
}
|
||||
break;
|
||||
|
||||
case RegexParseState::CharacterSetStart:
|
||||
if (c == '^') {
|
||||
state = RegexParseState::NegatedCharacterSetStart;
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
|
||||
case RegexParseState::NegatedCharacterSetStart:
|
||||
if (c == ']') {
|
||||
state = RegexParseState::CharacterSetMiddle;
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
|
||||
case RegexParseState::CharacterSetMiddle:
|
||||
middle:
|
||||
switch (c) {
|
||||
case '[':
|
||||
state = RegexParseState::PossibleCharacterSetExtension;
|
||||
break;
|
||||
case '\\':
|
||||
// Backslashes aren't supported in character groups, escape them
|
||||
boost_re.push_back('\\');
|
||||
state = RegexParseState::CharacterSetMiddle;
|
||||
break;
|
||||
case ']':
|
||||
state = RegexParseState::Regular;
|
||||
break;
|
||||
default:
|
||||
state = RegexParseState::CharacterSetMiddle;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case RegexParseState::PossibleCharacterSetExtension:
|
||||
switch (c) {
|
||||
case ':': case '.':
|
||||
state = RegexParseState::CharacterSetExtension;
|
||||
break;
|
||||
case '=':
|
||||
state = RegexParseState::EquivalenceClassExpression;
|
||||
break;
|
||||
default:
|
||||
goto middle;
|
||||
}
|
||||
break;
|
||||
|
||||
case RegexParseState::CharacterSetExtension:
|
||||
if (c == ']') {
|
||||
state = RegexParseState::CharacterSetMiddle;
|
||||
}
|
||||
break;
|
||||
|
||||
case RegexParseState::EquivalenceClassExpression:
|
||||
// C++'s regex parser only supports equivalence classes for
|
||||
// alphabetic characters
|
||||
if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
|
||||
throw boost::regex_error(
|
||||
boost::regex_constants::error_type::error_brack
|
||||
);
|
||||
}
|
||||
// After verifying first character, this can be parsed as
|
||||
// a regular character set extension, Boost will notice issues
|
||||
// after that.
|
||||
state = RegexParseState::CharacterSetExtension;
|
||||
break;
|
||||
}
|
||||
|
||||
boost_re.push_back(c);
|
||||
}
|
||||
return boost::regex(boost_re, boost::regex::extended);
|
||||
}
|
||||
|
||||
struct RegexCache
|
||||
{
|
||||
// TODO use C++20 transparent comparison when available
|
||||
std::unordered_map<std::string_view, std::regex> cache;
|
||||
std::unordered_map<std::string_view, boost::regex> cache;
|
||||
std::list<std::string> keys;
|
||||
|
||||
std::regex get(std::string_view re)
|
||||
boost::regex get(std::string_view re)
|
||||
{
|
||||
auto it = cache.find(re);
|
||||
if (it != cache.end())
|
||||
return it->second;
|
||||
keys.emplace_back(re);
|
||||
return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second;
|
||||
return cache.emplace(keys.back(), compile_regex(re)).first->second;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -3910,8 +4096,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
|
|||
NixStringContext context;
|
||||
const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.match");
|
||||
|
||||
std::cmatch match;
|
||||
if (!std::regex_match(str.begin(), str.end(), match, regex)) {
|
||||
boost::cmatch match;
|
||||
if (!boost::regex_match(str.begin(), str.end(), match, regex)) {
|
||||
v.mkNull();
|
||||
return;
|
||||
}
|
||||
|
@ -3926,8 +4112,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
|
|||
(v.listElems()[i] = state.allocValue())->mkString(match[i + 1].str());
|
||||
}
|
||||
|
||||
} catch (std::regex_error & e) {
|
||||
if (e.code() == std::regex_constants::error_space) {
|
||||
} catch (boost::regex_error & e) {
|
||||
if (e.code() == boost::regex_constants::error_space) {
|
||||
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
|
||||
state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
|
||||
.atPos(pos)
|
||||
|
@ -3988,8 +4174,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
|
|||
NixStringContext context;
|
||||
const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.split");
|
||||
|
||||
auto begin = std::cregex_iterator(str.begin(), str.end(), regex);
|
||||
auto end = std::cregex_iterator();
|
||||
auto begin = boost::cregex_iterator(str.begin(), str.end(), regex);
|
||||
auto end = boost::cregex_iterator();
|
||||
|
||||
// Any matches results are surrounded by non-matching results.
|
||||
const size_t len = std::distance(begin, end);
|
||||
|
@ -4028,8 +4214,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
|
|||
|
||||
assert(idx == 2 * len + 1);
|
||||
|
||||
} catch (std::regex_error & e) {
|
||||
if (e.code() == std::regex_constants::error_space) {
|
||||
} catch (boost::regex_error & e) {
|
||||
if (e.code() == boost::regex_constants::error_space) {
|
||||
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
|
||||
state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
|
||||
.atPos(pos)
|
||||
|
|
|
@ -816,6 +816,151 @@ namespace nix {
|
|||
ASSERT_THAT(*v.listElems()[0], IsStringEq("FOO"));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match5) {
|
||||
auto v = eval("builtins.match ''}'' ''}''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match6) {
|
||||
auto v = eval("builtins.match '']'' '']''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match7) {
|
||||
auto v = eval("builtins.match ''[[]'' ''[''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match8) {
|
||||
auto v = eval("builtins.match ''[]]'' '']''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match9) {
|
||||
auto v = eval("builtins.match ''[[=a=]]'' ''A''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match10) {
|
||||
auto v = eval("builtins.match ''[[.right-curly-bracket.]]'' ''}''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match11) {
|
||||
auto v = eval("builtins.match ''[[.tilde.]]'' ''~''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match12) {
|
||||
auto v = eval("builtins.match ''[\\n]'' ''\\''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match13) {
|
||||
auto v = eval("builtins.match ''[\\[]'' ''\\''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match14) {
|
||||
auto v = eval("builtins.match ''[\\]]'' ''\\]''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match15) {
|
||||
auto v = eval("builtins.match ''[\\-z]'' ''y''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match16) {
|
||||
auto v = eval("builtins.match ''[\\\\]'' ''\\''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match17) {
|
||||
auto v = eval("builtins.match ''[\\]'' ''\\''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match18) {
|
||||
auto v = eval("builtins.match ''[\\]]'' ''\\]''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match19) {
|
||||
auto v = eval("builtins.match ''.*[Ω].*'' ''β''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match20) {
|
||||
auto v = eval("builtins.match ''[^]]'' ''a''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match21) {
|
||||
auto v = eval("builtins.match ''[[[:alpha:]]'' ''[''");
|
||||
ASSERT_THAT(v, IsListOfSize(0));
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax1) {
|
||||
ASSERT_THROW(eval("builtins.match ''{'' ''{''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax2) {
|
||||
ASSERT_THROW(eval("builtins.match ''(a)\\1'' ''aa''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax3) {
|
||||
ASSERT_THROW(eval("builtins.match ''\\}'' ''}''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax4) {
|
||||
ASSERT_THROW(eval("builtins.match ''\\]'' '']''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax5) {
|
||||
ASSERT_THROW(eval("builtins.match ''\\x41'' ''A''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax6) {
|
||||
ASSERT_THROW(eval("builtins.match ''\\n'' \"\\n\""), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax7) {
|
||||
ASSERT_THROW(eval("builtins.match ''\\d'' ''1''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax8) {
|
||||
ASSERT_THROW(eval("builtins.match ''\\b1'' ''1''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax9) {
|
||||
ASSERT_THROW(eval("builtins.match ''\\A1'' ''1''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax10) {
|
||||
ASSERT_THROW(eval("builtins.match ''(?:1)'' ''1''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax11) {
|
||||
ASSERT_THROW(eval("builtins.match ''[b-a]'' ''b''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax12) {
|
||||
ASSERT_THROW(eval("builtins.match ''[[:alpha:]-a]'' ''b''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax13) {
|
||||
ASSERT_THROW(eval("builtins.match ''[[=1=]]'' ''1''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax14) {
|
||||
ASSERT_THROW(eval("builtins.match ''[[=]=]]'' '']''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, match_unsupported_syntax15) {
|
||||
ASSERT_THROW(eval("builtins.match ''[a-b-c]'' ''b''"), EvalError);
|
||||
}
|
||||
|
||||
TEST_F(PrimOpTest, attrNames) {
|
||||
auto v = eval("builtins.attrNames { x = 1; y = 2; z = 3; a = 2; }");
|
||||
ASSERT_THAT(v, IsListOfSize(4));
|
||||
|
|
Loading…
Reference in a new issue