Revert "libexpr: Replace regex engine with boost::regex"

This reverts commit 447212fa65. Reason for revert: Regression in eval behaviour bug-compatibility. Expected behaviour (Nix 2.18.5, macOS and Linux [libstdc++/libc++]): ``` nix-repl> builtins.match "\\.*(.*)" ".keep" [ "keep" ] nix-repl> builtins.match "(\\.*)(.*)" ".keep" [ "." "keep" ] ``` Actual behaviour (boost::regex): ``` nix-repl> builtins.match "\\.*(.*)" ".keep" [ ".keep" ] nix-repl> builtins.match "(\\.*)(.*)" ".keep" [ "." "keep" ] ``` Bug: https://git.lix.systems/lix-project/lix/issues/483 Change-Id: Id462eb8586dcd54856cf095f09b3e3a216955b60
2024-08-22 18:35:11 +00:00 · 2024-08-22 18:35:11 +00:00 · 9896d309cb
commit 9896d309cb
parent 447212fa65
4 changed files with 12 additions and 384 deletions
--- a/doc/manual/change-authors.yml
+++ b/doc/manual/change-authors.yml
@ -129,10 +129,6 @@ roberth:
  display_name: Robert Hensing
  github: roberth

-sugar:
-  forgejo: sugar
-  github: sugar700
-
 thufschmitt:
  display_name: Théophane Hufschmitt
  github: thufschmitt
--- a/doc/manual/rl-next/boost-regex.md
+++ b/doc/manual/rl-next/boost-regex.md
@ -1,37 +0,0 @@
---
-synopsis: Replace regex engine with boost::regex
-issues: [fj#34, fj#476]
-cls: [1821]
-category: Fixes
-credits: [sugar]
---
-
-Previously, the C++ standard regex expression library was used, the
-behaviour of which varied depending on the platform. This has been
-replaced with the Boost regex library, which works identically across
-platforms.
-
-The visible behaviour of the regex functions doesn't change. While
-the new library has more features, Lix will reject regular expressions
-using them.
-
-This also fixes regex matching reporting stack overflow when matching
-on too much data.
-
-Before:
-
-    nix-repl> builtins.match ".*" (
-                builtins.concatStringsSep "" (
-                  builtins.genList (_: "a") 1000000
-                )
-              )
-    error: stack overflow (possible infinite recursion)
-
-After:
-
-    nix-repl> builtins.match ".*" (
-                builtins.concatStringsSep "" (
-                  builtins.genList (_: "a") 1000000
-                )
-              )
-    [ ]
--- a/src/libexpr/primops.cc
+++ b/src/libexpr/primops.cc
@ -17,7 +17,6 @@
 #include "fetch-to-store.hh"

 #include <boost/container/small_vector.hpp>
-#include <boost/regex.hpp>
 #include <nlohmann/json.hpp>

 #include <sys/types.h>
@ -27,6 +26,7 @@
 #include <algorithm>
 #include <cstring>
 #include <sstream>
+#include <regex>
 #include <dlfcn.h>

 #include <cmath>
@ -3878,205 +3878,19 @@ static RegisterPrimOp primop_hashString({
    .fun = prim_hashString,
 });

-enum class RegexParseState {
-    // Anything outside of those
-    Regular,
-
-    // Bounded repeats, `}` shouldn't be escaped in those
-    //
-    // a{2,5}b
-    //   ^^^^
-    BoundedRepeat,
-
-    // Backslashes, as C++ regexes only support escaping what needs to be
-    // escaped and nothing else
-    //
-    // a\nb
-    //   ^
-    Backslash,
-
-    // Initial part of character set, as `[]]` is a regex for `]` character
-    //
-    // [abc] [^abc]
-    //  ^     ^
-    CharacterSetStart,
-
-    // Initial part of negated character set, as `[^]]` is a regex for
-    // anything but `]` character
-    //
-    // [^abc]
-    //   ^
-    NegatedCharacterSetStart,
-
-    // Character set after its first character
-    //
-    // [abc]
-    //   ^^
-    CharacterSetMiddle,
-
-    // Parser state after seeing [, assumes the input is character extension
-    // after seeing `:`, `.`, or `=`
-    //
-    // [a[:alpha:]b]
-    //    ^
-    PossibleCharacterSetExtension,
-
-    // Within character extension
-    //
-    // [a[:alpha:]b]
-    //     ^^^^^^^
-    CharacterSetExtension,
-
-    // Within equivalence class expression
-    //
-    // [[=a=]]
-    //    ^
-    EquivalenceClassExpression,
-};
-
-static boost::regex compile_regex(std::string_view re) {
-    // Make sure that Boost supports everything that C++ regexes do,
-    // and no non-standard extensions are available.
-    //
-    // In particular, C++ regexes only support escaping regex metacharacters.
-    // They don't support other escape sequences like `\n` and `\d`.
-    // Additionally, within character groups, it's not possible to escape
-    // anything, backslash is a literal character in those. `[\]` in regexes
-    // is a weird way to write `\\`.
-    std::string boost_re;
-    boost_re.reserve(re.size());
-    auto state = RegexParseState::Regular;
-    for (char c : re) {
-        switch (state) {
-        case RegexParseState::Regular:
-            switch (c) {
-            // Boost regex engine supports more escape sequences than C++ regexes,
-            // and as such it's necessary to ensure only escapes supported by C++
-            // are allowed.
-            case '\\':
-                state = RegexParseState::Backslash;
-                break;
-            case '[':
-                state = RegexParseState::CharacterSetStart;
-                break;
-            case '{':
-                state = RegexParseState::BoundedRepeat;
-                break;
-            // Boost doesn't permit unescaped `}`, escape it outside of
-            // bounded repeats.
-            case '}':
-                boost_re.push_back('\\');
-                break;
-            default:
-                break;
-            }
-            break;
-
-        case RegexParseState::BoundedRepeat:
-            if (c == '}') {
-                state = RegexParseState::Regular;
-            }
-            break;
-
-        case RegexParseState::Backslash:
-            switch (c) {
-                case '.': case '|': case '*': case '?': case '+': case '{':
-                case '^': case '$': case '[': case '(': case ')': case '\\':
-                    state = RegexParseState::Regular;
-                    break;
-                default:
-                    throw boost::regex_error(
-                        boost::regex_constants::error_type::error_escape
-                    );
-            }
-            break;
-
-        case RegexParseState::CharacterSetStart:
-            if (c == '^') {
-                state = RegexParseState::NegatedCharacterSetStart;
-                break;
-            }
-            [[fallthrough]];
-
-        case RegexParseState::NegatedCharacterSetStart:
-            if (c == ']') {
-                state = RegexParseState::CharacterSetMiddle;
-                break;
-            }
-            [[fallthrough]];
-
-        case RegexParseState::CharacterSetMiddle:
-        middle:
-            switch (c) {
-            case '[':
-                state = RegexParseState::PossibleCharacterSetExtension;
-                break;
-            case '\\':
-                // Backslashes aren't supported in character groups, escape them
-                boost_re.push_back('\\');
-                state = RegexParseState::CharacterSetMiddle;
-                break;
-            case ']':
-                state = RegexParseState::Regular;
-                break;
-            default:
-                state = RegexParseState::CharacterSetMiddle;
-                break;
-            }
-            break;
-
-        case RegexParseState::PossibleCharacterSetExtension:
-            switch (c) {
-                case ':': case '.':
-                    state = RegexParseState::CharacterSetExtension;
-                    break;
-                case '=':
-                    state = RegexParseState::EquivalenceClassExpression;
-                    break;
-                default:
-                    goto middle;
-            }
-            break;
-
-        case RegexParseState::CharacterSetExtension:
-            if (c == ']') {
-                state = RegexParseState::CharacterSetMiddle;
-            }
-            break;
-
-        case RegexParseState::EquivalenceClassExpression:
-            // C++'s regex parser only supports equivalence classes for
-            // alphabetic characters
-            if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
-                throw boost::regex_error(
-                    boost::regex_constants::error_type::error_brack
-                );
-            }
-            // After verifying first character, this can be parsed as
-            // a regular character set extension, Boost will notice issues
-            // after that.
-            state = RegexParseState::CharacterSetExtension;
-            break;
-        }
-
-        boost_re.push_back(c);
-    }
-    return boost::regex(boost_re, boost::regex::extended);
-}
-
 struct RegexCache
 {
    // TODO use C++20 transparent comparison when available
-    std::unordered_map<std::string_view, boost::regex> cache;
+    std::unordered_map<std::string_view, std::regex> cache;
    std::list<std::string> keys;

-    boost::regex get(std::string_view re)
+    std::regex get(std::string_view re)
    {
        auto it = cache.find(re);
        if (it != cache.end())
            return it->second;
        keys.emplace_back(re);
-        return cache.emplace(keys.back(), compile_regex(re)).first->second;
+        return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second;
    }
 };

@ -4096,8 +3910,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
        NixStringContext context;
        const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.match");

-        boost::cmatch match;
-        if (!boost::regex_match(str.begin(), str.end(), match, regex)) {
+        std::cmatch match;
+        if (!std::regex_match(str.begin(), str.end(), match, regex)) {
            v.mkNull();
            return;
        }
@ -4112,8 +3926,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
                (v.listElems()[i] = state.allocValue())->mkString(match[i + 1].str());
        }

-    } catch (boost::regex_error & e) {
-        if (e.code() == boost::regex_constants::error_space) {
+    } catch (std::regex_error & e) {
+        if (e.code() == std::regex_constants::error_space) {
            // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
            state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
                .atPos(pos)
@ -4174,8 +3988,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
        NixStringContext context;
        const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.split");

-        auto begin = boost::cregex_iterator(str.begin(), str.end(), regex);
-        auto end = boost::cregex_iterator();
+        auto begin = std::cregex_iterator(str.begin(), str.end(), regex);
+        auto end = std::cregex_iterator();

        // Any matches results are surrounded by non-matching results.
        const size_t len = std::distance(begin, end);
@ -4214,8 +4028,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)

        assert(idx == 2 * len + 1);

-    } catch (boost::regex_error & e) {
-        if (e.code() == boost::regex_constants::error_space) {
+    } catch (std::regex_error & e) {
+        if (e.code() == std::regex_constants::error_space) {
            // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
            state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
                .atPos(pos)
--- a/tests/unit/libexpr/primops.cc
+++ b/tests/unit/libexpr/primops.cc
@ -816,151 +816,6 @@ namespace nix {
        ASSERT_THAT(*v.listElems()[0], IsStringEq("FOO"));
    }

-    TEST_F(PrimOpTest, match5) {
-        auto v = eval("builtins.match ''}'' ''}''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match6) {
-        auto v = eval("builtins.match '']'' '']''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match7) {
-        auto v = eval("builtins.match ''[[]'' ''[''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match8) {
-        auto v = eval("builtins.match ''[]]'' '']''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match9) {
-        auto v = eval("builtins.match ''[[=a=]]'' ''A''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match10) {
-        auto v = eval("builtins.match ''[[.right-curly-bracket.]]'' ''}''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match11) {
-        auto v = eval("builtins.match ''[[.tilde.]]'' ''~''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match12) {
-        auto v = eval("builtins.match ''[\\n]'' ''\\''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match13) {
-        auto v = eval("builtins.match ''[\\[]'' ''\\''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match14) {
-        auto v = eval("builtins.match ''[\\]]'' ''\\]''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match15) {
-        auto v = eval("builtins.match ''[\\-z]'' ''y''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match16) {
-        auto v = eval("builtins.match ''[\\\\]'' ''\\''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match17) {
-        auto v = eval("builtins.match ''[\\]'' ''\\''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match18) {
-        auto v = eval("builtins.match ''[\\]]'' ''\\]''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match19) {
-        auto v = eval("builtins.match ''.*[Ω].*'' ''β''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match20) {
-        auto v = eval("builtins.match ''[^]]'' ''a''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match21) {
-        auto v = eval("builtins.match ''[[[:alpha:]]'' ''[''");
-        ASSERT_THAT(v, IsListOfSize(0));
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax1) {
-        ASSERT_THROW(eval("builtins.match ''{'' ''{''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax2) {
-        ASSERT_THROW(eval("builtins.match ''(a)\\1'' ''aa''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax3) {
-        ASSERT_THROW(eval("builtins.match ''\\}'' ''}''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax4) {
-        ASSERT_THROW(eval("builtins.match ''\\]'' '']''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax5) {
-        ASSERT_THROW(eval("builtins.match ''\\x41'' ''A''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax6) {
-        ASSERT_THROW(eval("builtins.match ''\\n'' \"\\n\""), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax7) {
-        ASSERT_THROW(eval("builtins.match ''\\d'' ''1''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax8) {
-        ASSERT_THROW(eval("builtins.match ''\\b1'' ''1''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax9) {
-        ASSERT_THROW(eval("builtins.match ''\\A1'' ''1''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax10) {
-        ASSERT_THROW(eval("builtins.match ''(?:1)'' ''1''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax11) {
-        ASSERT_THROW(eval("builtins.match ''[b-a]'' ''b''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax12) {
-        ASSERT_THROW(eval("builtins.match ''[[:alpha:]-a]'' ''b''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax13) {
-        ASSERT_THROW(eval("builtins.match ''[[=1=]]'' ''1''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax14) {
-        ASSERT_THROW(eval("builtins.match ''[[=]=]]'' '']''"), EvalError);
-    }
-
-    TEST_F(PrimOpTest, match_unsupported_syntax15) {
-        ASSERT_THROW(eval("builtins.match ''[a-b-c]'' ''b''"), EvalError);
-    }
-
    TEST_F(PrimOpTest, attrNames) {
        auto v = eval("builtins.attrNames { x = 1; y = 2; z = 3; a = 2; }");
        ASSERT_THAT(v, IsListOfSize(4));