nixpkgs/nixos/tests/consul.nix
Niklas Hambüchen a59a972413 consul.passthru.tests: Fix failure on current consul. Fixes #90613.
Done by setting `autopilot.min_quorum = 3`.

Techncially, this would have been required to keep the test correct since
Consul's "autopilot" "Dead Server Cleanup" was enabled by default (I believe
that was in Consul 0.8). Practically, the issue only occurred with our NixOS
test with releases >= `1.7.0-beta2` (see #90613). The setting itself is
available since Consul 1.6.2.

However, this setting was not documented clearly enough for anybody to notice,
and only the upstream issue https://github.com/hashicorp/consul/issues/8118
I filed brought that to light.

As explained there, the test could also have been made pass by applying the
more correct rolling reboot procedure

    -m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
    +m.wait_until_succeeds(
    +    "[ $(consul operator raft list-peers | grep true | wc -l) == 3 ]"
    +)

but we also intend to test that Consul can regain consensus even if
the quorum gets temporarily broken.
2020-06-18 02:22:31 +02:00

146 lines
4.5 KiB
Nix

import ./make-test-python.nix ({pkgs, lib, ...}:
let
# Settings for both servers and agents
webUi = true;
retry_interval = "1s";
raft_multiplier = 1;
defaultExtraConfig = {
inherit retry_interval;
performance = {
inherit raft_multiplier;
};
};
allConsensusServerHosts = [
"192.168.1.1"
"192.168.1.2"
"192.168.1.3"
];
allConsensusClientHosts = [
"192.168.2.1"
"192.168.2.2"
];
firewallSettings = {
# See https://www.consul.io/docs/install/ports.html
allowedTCPPorts = [ 8301 8302 8600 8500 8300 ];
allowedUDPPorts = [ 8301 8302 8600 ];
};
client = index: { pkgs, ... }:
let
ip = builtins.elemAt allConsensusClientHosts index;
in
{
environment.systemPackages = [ pkgs.consul ];
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
{ address = ip; prefixLength = 16; }
];
networking.firewall = firewallSettings;
services.consul = {
enable = true;
inherit webUi;
extraConfig = defaultExtraConfig // {
server = false;
retry_join = allConsensusServerHosts;
bind_addr = ip;
};
};
};
server = index: { pkgs, ... }:
let
numConsensusServers = builtins.length allConsensusServerHosts;
thisConsensusServerHost = builtins.elemAt allConsensusServerHosts index;
ip = thisConsensusServerHost; # since we already use IPs to identify servers
in
{
networking.interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
{ address = ip; prefixLength = 16; }
];
networking.firewall = firewallSettings;
services.consul =
assert builtins.elem thisConsensusServerHost allConsensusServerHosts;
{
enable = true;
inherit webUi;
extraConfig = defaultExtraConfig // {
server = true;
bootstrap_expect = numConsensusServers;
# Tell Consul that we never intend to drop below this many servers.
# Ensures to not permanently lose consensus after temporary loss.
# See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
autopilot.min_quorum = numConsensusServers;
retry_join =
# If there's only 1 node in the network, we allow self-join;
# otherwise, the node must not try to join itself, and join only the other servers.
# See https://github.com/hashicorp/consul/issues/2868
if numConsensusServers == 1
then allConsensusServerHosts
else builtins.filter (h: h != thisConsensusServerHost) allConsensusServerHosts;
bind_addr = ip;
};
};
};
in {
name = "consul";
nodes = {
server1 = server 0;
server2 = server 1;
server3 = server 2;
client1 = client 0;
client2 = client 1;
};
testScript = ''
servers = [server1, server2, server3]
machines = [server1, server2, server3, client1, client2]
for m in machines:
m.wait_for_unit("consul.service")
for m in machines:
m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
client1.succeed("consul kv put testkey 42")
client2.succeed("[ $(consul kv get testkey) == 42 ]")
# Test that the cluster can tolearate failures of any single server:
for server in servers:
server.crash()
# For each client, wait until they have connection again
# using `kv get -recurse` before issuing commands.
client1.wait_until_succeeds("consul kv get -recurse")
client2.wait_until_succeeds("consul kv get -recurse")
# Do some consul actions while one server is down.
client1.succeed("consul kv put testkey 43")
client2.succeed("[ $(consul kv get testkey) == 43 ]")
client2.succeed("consul kv delete testkey")
# Restart crashed machine.
server.start()
# Wait for recovery.
for m in machines:
m.wait_until_succeeds("[ $(consul members | grep -o alive | wc -l) == 5 ]")
# Wait for client connections.
client1.wait_until_succeeds("consul kv get -recurse")
client2.wait_until_succeeds("consul kv get -recurse")
# Do some consul actions with server back up.
client1.succeed("consul kv put testkey 44")
client2.succeed("[ $(consul kv get testkey) == 44 ]")
client2.succeed("consul kv delete testkey")
'';
})