From 9896d309cbf3e4c0888760981654c1da0b5983a9 Mon Sep 17 00:00:00 2001 From: jade Date: Thu, 22 Aug 2024 18:35:11 +0000 Subject: [PATCH] Revert "libexpr: Replace regex engine with boost::regex" This reverts commit 447212fa65a80180150b265411924cc638a2c52c. Reason for revert: Regression in eval behaviour bug-compatibility. Expected behaviour (Nix 2.18.5, macOS and Linux [libstdc++/libc++]): ``` nix-repl> builtins.match "\\.*(.*)" ".keep" [ "keep" ] nix-repl> builtins.match "(\\.*)(.*)" ".keep" [ "." "keep" ] ``` Actual behaviour (boost::regex): ``` nix-repl> builtins.match "\\.*(.*)" ".keep" [ ".keep" ] nix-repl> builtins.match "(\\.*)(.*)" ".keep" [ "." "keep" ] ``` Bug: https://git.lix.systems/lix-project/lix/issues/483 Change-Id: Id462eb8586dcd54856cf095f09b3e3a216955b60 --- doc/manual/change-authors.yml | 4 - doc/manual/rl-next/boost-regex.md | 37 ------ src/libexpr/primops.cc | 210 ++---------------------------- tests/unit/libexpr/primops.cc | 145 --------------------- 4 files changed, 12 insertions(+), 384 deletions(-) delete mode 100644 doc/manual/rl-next/boost-regex.md diff --git a/doc/manual/change-authors.yml b/doc/manual/change-authors.yml index d9303a747..e18abada1 100644 --- a/doc/manual/change-authors.yml +++ b/doc/manual/change-authors.yml @@ -129,10 +129,6 @@ roberth: display_name: Robert Hensing github: roberth -sugar: - forgejo: sugar - github: sugar700 - thufschmitt: display_name: Théophane Hufschmitt github: thufschmitt diff --git a/doc/manual/rl-next/boost-regex.md b/doc/manual/rl-next/boost-regex.md deleted file mode 100644 index c541434d0..000000000 --- a/doc/manual/rl-next/boost-regex.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -synopsis: Replace regex engine with boost::regex -issues: [fj#34, fj#476] -cls: [1821] -category: Fixes -credits: [sugar] ---- - -Previously, the C++ standard regex expression library was used, the -behaviour of which varied depending on the platform. This has been -replaced with the Boost regex library, which works identically across -platforms. - -The visible behaviour of the regex functions doesn't change. While -the new library has more features, Lix will reject regular expressions -using them. - -This also fixes regex matching reporting stack overflow when matching -on too much data. - -Before: - - nix-repl> builtins.match ".*" ( - builtins.concatStringsSep "" ( - builtins.genList (_: "a") 1000000 - ) - ) - error: stack overflow (possible infinite recursion) - -After: - - nix-repl> builtins.match ".*" ( - builtins.concatStringsSep "" ( - builtins.genList (_: "a") 1000000 - ) - ) - [ ] diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc index d6618df2a..dab96d6d4 100644 --- a/src/libexpr/primops.cc +++ b/src/libexpr/primops.cc @@ -17,7 +17,6 @@ #include "fetch-to-store.hh" #include -#include #include #include @@ -27,6 +26,7 @@ #include #include #include +#include #include #include @@ -3878,205 +3878,19 @@ static RegisterPrimOp primop_hashString({ .fun = prim_hashString, }); -enum class RegexParseState { - // Anything outside of those - Regular, - - // Bounded repeats, `}` shouldn't be escaped in those - // - // a{2,5}b - // ^^^^ - BoundedRepeat, - - // Backslashes, as C++ regexes only support escaping what needs to be - // escaped and nothing else - // - // a\nb - // ^ - Backslash, - - // Initial part of character set, as `[]]` is a regex for `]` character - // - // [abc] [^abc] - // ^ ^ - CharacterSetStart, - - // Initial part of negated character set, as `[^]]` is a regex for - // anything but `]` character - // - // [^abc] - // ^ - NegatedCharacterSetStart, - - // Character set after its first character - // - // [abc] - // ^^ - CharacterSetMiddle, - - // Parser state after seeing [, assumes the input is character extension - // after seeing `:`, `.`, or `=` - // - // [a[:alpha:]b] - // ^ - PossibleCharacterSetExtension, - - // Within character extension - // - // [a[:alpha:]b] - // ^^^^^^^ - CharacterSetExtension, - - // Within equivalence class expression - // - // [[=a=]] - // ^ - EquivalenceClassExpression, -}; - -static boost::regex compile_regex(std::string_view re) { - // Make sure that Boost supports everything that C++ regexes do, - // and no non-standard extensions are available. - // - // In particular, C++ regexes only support escaping regex metacharacters. - // They don't support other escape sequences like `\n` and `\d`. - // Additionally, within character groups, it's not possible to escape - // anything, backslash is a literal character in those. `[\]` in regexes - // is a weird way to write `\\`. - std::string boost_re; - boost_re.reserve(re.size()); - auto state = RegexParseState::Regular; - for (char c : re) { - switch (state) { - case RegexParseState::Regular: - switch (c) { - // Boost regex engine supports more escape sequences than C++ regexes, - // and as such it's necessary to ensure only escapes supported by C++ - // are allowed. - case '\\': - state = RegexParseState::Backslash; - break; - case '[': - state = RegexParseState::CharacterSetStart; - break; - case '{': - state = RegexParseState::BoundedRepeat; - break; - // Boost doesn't permit unescaped `}`, escape it outside of - // bounded repeats. - case '}': - boost_re.push_back('\\'); - break; - default: - break; - } - break; - - case RegexParseState::BoundedRepeat: - if (c == '}') { - state = RegexParseState::Regular; - } - break; - - case RegexParseState::Backslash: - switch (c) { - case '.': case '|': case '*': case '?': case '+': case '{': - case '^': case '$': case '[': case '(': case ')': case '\\': - state = RegexParseState::Regular; - break; - default: - throw boost::regex_error( - boost::regex_constants::error_type::error_escape - ); - } - break; - - case RegexParseState::CharacterSetStart: - if (c == '^') { - state = RegexParseState::NegatedCharacterSetStart; - break; - } - [[fallthrough]]; - - case RegexParseState::NegatedCharacterSetStart: - if (c == ']') { - state = RegexParseState::CharacterSetMiddle; - break; - } - [[fallthrough]]; - - case RegexParseState::CharacterSetMiddle: - middle: - switch (c) { - case '[': - state = RegexParseState::PossibleCharacterSetExtension; - break; - case '\\': - // Backslashes aren't supported in character groups, escape them - boost_re.push_back('\\'); - state = RegexParseState::CharacterSetMiddle; - break; - case ']': - state = RegexParseState::Regular; - break; - default: - state = RegexParseState::CharacterSetMiddle; - break; - } - break; - - case RegexParseState::PossibleCharacterSetExtension: - switch (c) { - case ':': case '.': - state = RegexParseState::CharacterSetExtension; - break; - case '=': - state = RegexParseState::EquivalenceClassExpression; - break; - default: - goto middle; - } - break; - - case RegexParseState::CharacterSetExtension: - if (c == ']') { - state = RegexParseState::CharacterSetMiddle; - } - break; - - case RegexParseState::EquivalenceClassExpression: - // C++'s regex parser only supports equivalence classes for - // alphabetic characters - if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { - throw boost::regex_error( - boost::regex_constants::error_type::error_brack - ); - } - // After verifying first character, this can be parsed as - // a regular character set extension, Boost will notice issues - // after that. - state = RegexParseState::CharacterSetExtension; - break; - } - - boost_re.push_back(c); - } - return boost::regex(boost_re, boost::regex::extended); -} - struct RegexCache { // TODO use C++20 transparent comparison when available - std::unordered_map cache; + std::unordered_map cache; std::list keys; - boost::regex get(std::string_view re) + std::regex get(std::string_view re) { auto it = cache.find(re); if (it != cache.end()) return it->second; keys.emplace_back(re); - return cache.emplace(keys.back(), compile_regex(re)).first->second; + return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second; } }; @@ -4096,8 +3910,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v) NixStringContext context; const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.match"); - boost::cmatch match; - if (!boost::regex_match(str.begin(), str.end(), match, regex)) { + std::cmatch match; + if (!std::regex_match(str.begin(), str.end(), match, regex)) { v.mkNull(); return; } @@ -4112,8 +3926,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v) (v.listElems()[i] = state.allocValue())->mkString(match[i + 1].str()); } - } catch (boost::regex_error & e) { - if (e.code() == boost::regex_constants::error_space) { + } catch (std::regex_error & e) { + if (e.code() == std::regex_constants::error_space) { // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++ state.error("memory limit exceeded by regular expression '%s'", re) .atPos(pos) @@ -4174,8 +3988,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v) NixStringContext context; const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.split"); - auto begin = boost::cregex_iterator(str.begin(), str.end(), regex); - auto end = boost::cregex_iterator(); + auto begin = std::cregex_iterator(str.begin(), str.end(), regex); + auto end = std::cregex_iterator(); // Any matches results are surrounded by non-matching results. const size_t len = std::distance(begin, end); @@ -4214,8 +4028,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v) assert(idx == 2 * len + 1); - } catch (boost::regex_error & e) { - if (e.code() == boost::regex_constants::error_space) { + } catch (std::regex_error & e) { + if (e.code() == std::regex_constants::error_space) { // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++ state.error("memory limit exceeded by regular expression '%s'", re) .atPos(pos) diff --git a/tests/unit/libexpr/primops.cc b/tests/unit/libexpr/primops.cc index b73fdff72..bd174a6c0 100644 --- a/tests/unit/libexpr/primops.cc +++ b/tests/unit/libexpr/primops.cc @@ -816,151 +816,6 @@ namespace nix { ASSERT_THAT(*v.listElems()[0], IsStringEq("FOO")); } - TEST_F(PrimOpTest, match5) { - auto v = eval("builtins.match ''}'' ''}''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match6) { - auto v = eval("builtins.match '']'' '']''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match7) { - auto v = eval("builtins.match ''[[]'' ''[''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match8) { - auto v = eval("builtins.match ''[]]'' '']''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match9) { - auto v = eval("builtins.match ''[[=a=]]'' ''A''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match10) { - auto v = eval("builtins.match ''[[.right-curly-bracket.]]'' ''}''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match11) { - auto v = eval("builtins.match ''[[.tilde.]]'' ''~''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match12) { - auto v = eval("builtins.match ''[\\n]'' ''\\''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match13) { - auto v = eval("builtins.match ''[\\[]'' ''\\''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match14) { - auto v = eval("builtins.match ''[\\]]'' ''\\]''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match15) { - auto v = eval("builtins.match ''[\\-z]'' ''y''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match16) { - auto v = eval("builtins.match ''[\\\\]'' ''\\''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match17) { - auto v = eval("builtins.match ''[\\]'' ''\\''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match18) { - auto v = eval("builtins.match ''[\\]]'' ''\\]''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match19) { - auto v = eval("builtins.match ''.*[Ω].*'' ''β''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match20) { - auto v = eval("builtins.match ''[^]]'' ''a''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match21) { - auto v = eval("builtins.match ''[[[:alpha:]]'' ''[''"); - ASSERT_THAT(v, IsListOfSize(0)); - } - - TEST_F(PrimOpTest, match_unsupported_syntax1) { - ASSERT_THROW(eval("builtins.match ''{'' ''{''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax2) { - ASSERT_THROW(eval("builtins.match ''(a)\\1'' ''aa''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax3) { - ASSERT_THROW(eval("builtins.match ''\\}'' ''}''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax4) { - ASSERT_THROW(eval("builtins.match ''\\]'' '']''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax5) { - ASSERT_THROW(eval("builtins.match ''\\x41'' ''A''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax6) { - ASSERT_THROW(eval("builtins.match ''\\n'' \"\\n\""), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax7) { - ASSERT_THROW(eval("builtins.match ''\\d'' ''1''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax8) { - ASSERT_THROW(eval("builtins.match ''\\b1'' ''1''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax9) { - ASSERT_THROW(eval("builtins.match ''\\A1'' ''1''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax10) { - ASSERT_THROW(eval("builtins.match ''(?:1)'' ''1''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax11) { - ASSERT_THROW(eval("builtins.match ''[b-a]'' ''b''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax12) { - ASSERT_THROW(eval("builtins.match ''[[:alpha:]-a]'' ''b''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax13) { - ASSERT_THROW(eval("builtins.match ''[[=1=]]'' ''1''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax14) { - ASSERT_THROW(eval("builtins.match ''[[=]=]]'' '']''"), EvalError); - } - - TEST_F(PrimOpTest, match_unsupported_syntax15) { - ASSERT_THROW(eval("builtins.match ''[a-b-c]'' ''b''"), EvalError); - } - TEST_F(PrimOpTest, attrNames) { auto v = eval("builtins.attrNames { x = 1; y = 2; z = 3; a = 2; }"); ASSERT_THAT(v, IsListOfSize(4));