a59a972413
Done by setting `autopilot.min_quorum = 3`. Techncially, this would have been required to keep the test correct since Consul's "autopilot" "Dead Server Cleanup" was enabled by default (I believe that was in Consul 0.8). Practically, the issue only occurred with our NixOS test with releases >= `1.7.0-beta2` (see #90613). The setting itself is available since Consul 1.6.2. However, this setting was not documented clearly enough for anybody to notice, and only the upstream issue https://github.com/hashicorp/consul/issues/8118 I filed brought that to light. As explained there, the test could also have been made pass by applying the more correct rolling reboot procedure -m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]") +m.wait_until_succeeds( + "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]" +) but we also intend to test that Consul can regain consensus even if the quorum gets temporarily broken.
146 lines
4.5 KiB
Nix
146 lines
4.5 KiB
Nix
import ./make-test-python.nix ({pkgs, lib, ...}:
|
|
|
|
let
|
|
# Settings for both servers and agents
|
|
webUi = true;
|
|
retry_interval = "1s";
|
|
raft_multiplier = 1;
|
|
|
|
defaultExtraConfig = {
|
|
inherit retry_interval;
|
|
performance = {
|
|
inherit raft_multiplier;
|
|
};
|
|
};
|
|
|
|
allConsensusServerHosts = [
|
|
"192.168.1.1"
|
|
"192.168.1.2"
|
|
"192.168.1.3"
|
|
];
|
|
|
|
allConsensusClientHosts = [
|
|
"192.168.2.1"
|
|
"192.168.2.2"
|
|
];
|
|
|
|
firewallSettings = {
|
|
# See https://www.consul.io/docs/install/ports.html
|
|
allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
|
|
allowedUDPPorts = [ 8301 8302 8600 ];
|
|
};
|
|
|
|
client = index: { pkgs, ... }:
|
|
let
|
|
ip = builtins.elemAt allConsensusClientHosts index;
|
|
in
|
|
{
|
|
environment.systemPackages = [ pkgs.consul ];
|
|
|
|
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
|
|
{ address = ip; prefixLength = 16; }
|
|
];
|
|
networking.firewall = firewallSettings;
|
|
|
|
services.consul = {
|
|
enable = true;
|
|
inherit webUi;
|
|
extraConfig = defaultExtraConfig // {
|
|
server = false;
|
|
retry_join = allConsensusServerHosts;
|
|
bind_addr = ip;
|
|
};
|
|
};
|
|
};
|
|
|
|
server = index: { pkgs, ... }:
|
|
let
|
|
numConsensusServers = builtins.length allConsensusServerHosts;
|
|
thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
|
|
ip = thisConsensusServerHost; # since we already use IPs to identify servers
|
|
in
|
|
{
|
|
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
|
|
{ address = ip; prefixLength = 16; }
|
|
];
|
|
networking.firewall = firewallSettings;
|
|
|
|
services.consul =
|
|
assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
|
|
{
|
|
enable = true;
|
|
inherit webUi;
|
|
extraConfig = defaultExtraConfig // {
|
|
server = true;
|
|
bootstrap_expect = numConsensusServers;
|
|
# Tell Consul that we never intend to drop below this many servers.
|
|
# Ensures to not permanently lose consensus after temporary loss.
|
|
# See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
|
|
autopilot.min_quorum = numConsensusServers;
|
|
retry_join =
|
|
# If there's only 1 node in the network, we allow self-join;
|
|
# otherwise, the node must not try to join itself, and join only the other servers.
|
|
# See https://github.com/hashicorp/consul/issues/2868
|
|
if numConsensusServers == 1
|
|
then allConsensusServerHosts
|
|
else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
|
|
bind_addr = ip;
|
|
};
|
|
};
|
|
};
|
|
in {
|
|
name = "consul";
|
|
|
|
nodes = {
|
|
server1 = server 0;
|
|
server2 = server 1;
|
|
server3 = server 2;
|
|
|
|
client1 = client 0;
|
|
client2 = client 1;
|
|
};
|
|
|
|
testScript = ''
|
|
servers = [server1, server2, server3]
|
|
machines = [server1, server2, server3, client1, client2]
|
|
|
|
for m in machines:
|
|
m.wait_for_unit("consul.service")
|
|
|
|
for m in machines:
|
|
m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
|
|
|
|
client1.succeed("consul kv put testkey 42")
|
|
client2.succeed("[ $(consul kv get testkey) == 42 ]")
|
|
|
|
# Test that the cluster can tolearate failures of any single server:
|
|
for server in servers:
|
|
server.crash()
|
|
|
|
# For each client, wait until they have connection again
|
|
# using `kv get -recurse` before issuing commands.
|
|
client1.wait_until_succeeds("consul kv get -recurse")
|
|
client2.wait_until_succeeds("consul kv get -recurse")
|
|
|
|
# Do some consul actions while one server is down.
|
|
client1.succeed("consul kv put testkey 43")
|
|
client2.succeed("[ $(consul kv get testkey) == 43 ]")
|
|
client2.succeed("consul kv delete testkey")
|
|
|
|
# Restart crashed machine.
|
|
server.start()
|
|
|
|
# Wait for recovery.
|
|
for m in machines:
|
|
m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
|
|
|
|
# Wait for client connections.
|
|
client1.wait_until_succeeds("consul kv get -recurse")
|
|
client2.wait_until_succeeds("consul kv get -recurse")
|
|
|
|
# Do some consul actions with server back up.
|
|
client1.succeed("consul kv put testkey 44")
|
|
client2.succeed("[ $(consul kv get testkey) == 44 ]")
|
|
client2.succeed("consul kv delete testkey")
|
|
'';
|
|
})
|