nixpkgs/nixos/modules/virtualisation/containers.nix
Arnold Krille 9045a8e24c declarative containers: additional veths
With these changes, a container can have more then one veth-pair. This allows for example to have LAN and DMZ as bridges on the host and add dedicated containers for proxies, ipv4-firewall and ipv6-firewall. Or to have a bridge for normal WAN, one bridge for administration and one bridge for customer-internal communication. So that web-server containers can be reached from outside per http, from the management via ssh and can talk to their database via the customer network.

The scripts to set up the containers are now rendered several times instead of just one template. The scripts now contain per-container code to configure the extra veth interfaces. The default template without support for extra-veths is still rendered for the imperative containers.

Also a test is there to see if extra veths can be placed into host-bridges or can be reached via routing.
2016-07-28 23:06:41 +02:00

584 lines
19 KiB
Nix

{ config, lib, pkgs, ... }:
with lib;
let
# The container's init script, a small wrapper around the regular
# NixOS stage-2 init script.
containerInit = (cfg:
let
renderExtraVeth = (name: cfg:
''
echo "Bringing ${name} up"
ip link set dev ${name} up
${optionalString (cfg . "localAddress" or null != null) ''
echo "Setting ip for ${name}"
ip addr add ${cfg . "localAddress"} dev ${name}
''}
${optionalString (cfg . "localAddress6" or null != null) ''
echo "Setting ip6 for ${name}"
ip -6 addr add ${cfg . "localAddress6"} dev ${name}
''}
${optionalString (cfg . "hostAddress" or null != null) ''
echo "Setting route to host for ${name}"
ip route add ${cfg . "hostAddress"} dev ${name}
''}
${optionalString (cfg . "hostAddress6" or null != null) ''
echo "Setting route6 to host for ${name}"
ip -6 route add ${cfg . "hostAddress6"} dev ${name}
''}
''
);
in
pkgs.writeScript "container-init"
''
#! ${pkgs.stdenv.shell} -e
# Initialise the container side of the veth pair.
if [ "$PRIVATE_NETWORK" = 1 ]; then
ip link set host0 name eth0
ip link set dev eth0 up
if [ -n "$LOCAL_ADDRESS" ]; then
ip addr add $LOCAL_ADDRESS dev eth0
fi
if [ -n "$LOCAL_ADDRESS6" ]; then
ip -6 addr add $LOCAL_ADDRESS6 dev eth0
fi
if [ -n "$HOST_ADDRESS" ]; then
ip route add $HOST_ADDRESS dev eth0
ip route add default via $HOST_ADDRESS
fi
if [ -n "$HOST_ADDRESS6" ]; then
ip -6 route add $HOST_ADDRESS6 dev eth0
ip -6 route add default via $HOST_ADDRESS6
fi
${concatStringsSep "\n" (mapAttrsToList renderExtraVeth cfg . "extraVeths" or {})}
ip a
ip r
fi
# Start the regular stage 1 script.
exec "$1"
''
);
nspawnExtraVethArgs = (name: cfg: "--network-veth-extra=${name}");
startScript = (cfg:
''
mkdir -p -m 0755 "$root/etc" "$root/var/lib"
mkdir -p -m 0700 "$root/var/lib/private" "$root/root" /run/containers
if ! [ -e "$root/etc/os-release" ]; then
touch "$root/etc/os-release"
fi
if ! [ -e "$root/etc/machine-id" ]; then
touch "$root/etc/machine-id"
fi
mkdir -p -m 0755 \
"/nix/var/nix/profiles/per-container/$INSTANCE" \
"/nix/var/nix/gcroots/per-container/$INSTANCE"
cp --remove-destination /etc/resolv.conf "$root/etc/resolv.conf"
if [ "$PRIVATE_NETWORK" = 1 ]; then
extraFlags+=" --network-veth"
if [ -n "$HOST_BRIDGE" ]; then
extraFlags+=" --network-bridge=$HOST_BRIDGE"
fi
fi
${if cfg . "extraVeths" or null != null then
''extraFlags+=" ${concatStringsSep " " (mapAttrsToList nspawnExtraVethArgs cfg . "extraVeths" or {})}"''
else
''# No extra veth pairs to create''
}
for iface in $INTERFACES; do
extraFlags+=" --network-interface=$iface"
done
for iface in $MACVLANS; do
extraFlags+=" --network-macvlan=$iface"
done
# If the host is 64-bit and the container is 32-bit, add a
# --personality flag.
${optionalString (config.nixpkgs.system == "x86_64-linux") ''
if [ "$(< ''${SYSTEM_PATH:-/nix/var/nix/profiles/per-container/$INSTANCE/system}/system)" = i686-linux ]; then
extraFlags+=" --personality=x86"
fi
''}
# Run systemd-nspawn without startup notification (we'll
# wait for the container systemd to signal readiness).
EXIT_ON_REBOOT=1 \
exec ${config.systemd.package}/bin/systemd-nspawn \
--keep-unit \
-M "$INSTANCE" -D "$root" $extraFlags \
$EXTRA_NSPAWN_FLAGS \
--notify-ready=yes \
--bind-ro=/nix/store \
--bind-ro=/nix/var/nix/db \
--bind-ro=/nix/var/nix/daemon-socket \
--bind="/nix/var/nix/profiles/per-container/$INSTANCE:/nix/var/nix/profiles" \
--bind="/nix/var/nix/gcroots/per-container/$INSTANCE:/nix/var/nix/gcroots" \
--setenv PRIVATE_NETWORK="$PRIVATE_NETWORK" \
--setenv HOST_BRIDGE="$HOST_BRIDGE" \
--setenv HOST_ADDRESS="$HOST_ADDRESS" \
--setenv LOCAL_ADDRESS="$LOCAL_ADDRESS" \
--setenv HOST_ADDRESS6="$HOST_ADDRESS6" \
--setenv LOCAL_ADDRESS6="$LOCAL_ADDRESS6" \
--setenv PATH="$PATH" \
${containerInit cfg} "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init"
''
);
preStartScript = (cfg:
''
# Clean up existing machined registration and interfaces.
machinectl terminate "$INSTANCE" 2> /dev/null || true
if [ "$PRIVATE_NETWORK" = 1 ]; then
ip link del dev "ve-$INSTANCE" 2> /dev/null || true
ip link del dev "vb-$INSTANCE" 2> /dev/null || true
fi
${concatStringsSep "\n" (
mapAttrsToList (name: cfg:
''ip link del dev ${name} 2> /dev/null || true ''
) cfg . "extraVeths" or {}
)}
''
);
postStartScript = (cfg:
let
ipcall = (cfg: ipcmd: variable: attribute:
if cfg . attribute or null == null then
''
if [ -n "${variable}" ]; then
${ipcmd} add ${variable} dev $ifaceHost
fi
''
else
''${ipcmd} add ${cfg . attribute} dev $ifaceHost''
);
renderExtraVeth = (name: cfg:
if cfg . "hostBridge" or null != null then
''
# Add ${name} to bridge ${cfg.hostBridge}
ip link set dev ${name} master ${cfg.hostBridge} up
''
else
''
# Set IPs and routes for ${name}
${optionalString (cfg . "hostAddress" or null != null) ''
ip addr add ${cfg . "hostAddress"} dev ${name}
''}
${optionalString (cfg . "hostAddress6" or null != null) ''
ip -6 addr add ${cfg . "hostAddress6"} dev ${name}
''}
${optionalString (cfg . "localAddress" or null != null) ''
ip route add ${cfg . "localAddress"} dev ${name}
''}
${optionalString (cfg . "localAddress6" or null != null) ''
ip -6 route add ${cfg . "localAddress6"} dev ${name}
''}
''
);
in
''
if [ "$PRIVATE_NETWORK" = 1 ]; then
if [ -z "$HOST_BRIDGE" ]; then
ifaceHost=ve-$INSTANCE
ip link set dev $ifaceHost up
${ipcall cfg "ip addr" "$HOST_ADDRESS" "hostAddress"}
${ipcall cfg "ip -6 addr" "$HOST_ADDRESS6" "hostAddress6"}
${ipcall cfg "ip route" "$LOCAL_ADDRESS" "localAddress"}
${ipcall cfg "ip -6 route" "$LOCAL_ADDRESS6" "localAddress6"}
fi
${concatStringsSep "\n" (mapAttrsToList renderExtraVeth cfg . "extraVeths" or {})}
fi
# Get the leader PID so that we can signal it in
# preStop. We can't use machinectl there because D-Bus
# might be shutting down. FIXME: in systemd 219 we can
# just signal systemd-nspawn to do a clean shutdown.
machinectl show "$INSTANCE" | sed 's/Leader=\(.*\)/\1/;t;d' > "/run/containers/$INSTANCE.pid"
''
);
system = config.nixpkgs.system;
bindMountOpts = { name, config, ... }: {
options = {
mountPoint = mkOption {
example = "/mnt/usb";
type = types.str;
description = "Mount point on the container file system.";
};
hostPath = mkOption {
default = null;
example = "/home/alice";
type = types.nullOr types.str;
description = "Location of the host path to be mounted.";
};
isReadOnly = mkOption {
default = true;
example = true;
type = types.bool;
description = "Determine whether the mounted path will be accessed in read-only mode.";
};
};
config = {
mountPoint = mkDefault name;
};
};
mkBindFlag = d:
let flagPrefix = if d.isReadOnly then " --bind-ro=" else " --bind=";
mountstr = if d.hostPath != null then "${d.hostPath}:${d.mountPoint}" else "${d.mountPoint}";
in flagPrefix + mountstr ;
mkBindFlags = bs: concatMapStrings mkBindFlag (lib.attrValues bs);
networkOptions = {
hostBridge = mkOption {
type = types.nullOr types.string;
default = null;
example = "br0";
description = ''
Put the host-side of the veth-pair into the named bridge.
Only one of hostAddress* or hostBridge can be given.
'';
};
hostAddress = mkOption {
type = types.nullOr types.str;
default = null;
example = "10.231.136.1";
description = ''
The IPv4 address assigned to the host interface.
(Not used when hostBridge is set.)
'';
};
hostAddress6 = mkOption {
type = types.nullOr types.string;
default = null;
example = "fc00::1";
description = ''
The IPv6 address assigned to the host interface.
(Not used when hostBridge is set.)
'';
};
localAddress = mkOption {
type = types.nullOr types.str;
default = null;
example = "10.231.136.2";
description = ''
The IPv4 address assigned to the interface in the container.
If a hostBridge is used, this should be given with netmask to access
the whole network. Otherwise the default netmask is /32 and routing is
set up from localAddress to hostAddress and back.
'';
};
localAddress6 = mkOption {
type = types.nullOr types.string;
default = null;
example = "fc00::2";
description = ''
The IPv6 address assigned to the interface in the container.
If a hostBridge is used, this should be given with netmask to access
the whole network. Otherwise the default netmask is /128 and routing is
set up from localAddress6 to hostAddress6 and back.
'';
};
};
in
{
options = {
boot.isContainer = mkOption {
type = types.bool;
default = false;
description = ''
Whether this NixOS machine is a lightweight container running
in another NixOS system.
'';
};
boot.enableContainers = mkOption {
type = types.bool;
default = !config.boot.isContainer;
description = ''
Whether to enable support for nixos containers.
'';
};
containers = mkOption {
type = types.attrsOf (types.submodule (
{ config, options, name, ... }:
{
options = {
config = mkOption {
description = ''
A specification of the desired configuration of this
container, as a NixOS module.
'';
};
path = mkOption {
type = types.path;
example = "/nix/var/nix/profiles/containers/webserver";
description = ''
As an alternative to specifying
<option>config</option>, you can specify the path to
the evaluated NixOS system configuration, typically a
symlink to a system profile.
'';
};
privateNetwork = mkOption {
type = types.bool;
default = false;
description = ''
Whether to give the container its own private virtual
Ethernet interface. The interface is called
<literal>eth0</literal>, and is hooked up to the interface
<literal>ve-<replaceable>container-name</replaceable></literal>
on the host. If this option is not set, then the
container shares the network interfaces of the host,
and can bind to any port on any interface.
'';
};
interfaces = mkOption {
type = types.listOf types.string;
default = [];
example = [ "eth1" "eth2" ];
description = ''
The list of interfaces to be moved into the container.
'';
};
extraVeths = mkOption {
type = types.attrsOf types.optionSet;
default = {};
options = networkOptions;
description = ''
Extra veth-pairs to be created for the container
'';
};
autoStart = mkOption {
type = types.bool;
default = false;
description = ''
Wether the container is automatically started at boot-time.
'';
};
bindMounts = mkOption {
type = types.loaOf types.optionSet;
options = [ bindMountOpts ];
default = {};
example = { "/home" = { hostPath = "/home/alice";
isReadOnly = false; };
};
description =
''
An extra list of directories that is bound to the container.
'';
};
} // networkOptions;
config = mkMerge
[ (mkIf options.config.isDefined {
path = (import ../../lib/eval-config.nix {
inherit system;
modules =
let extraConfig =
{ boot.isContainer = true;
networking.hostName = mkDefault name;
networking.useDHCP = false;
};
in [ extraConfig config.config ];
prefix = [ "containers" name ];
}).config.system.build.toplevel;
})
];
}));
default = {};
example = literalExample
''
{ webserver =
{ path = "/nix/var/nix/profiles/webserver";
};
database =
{ config =
{ config, pkgs, ... }:
{ services.postgresql.enable = true;
services.postgresql.package = pkgs.postgresql92;
};
};
}
'';
description = ''
A set of NixOS system configurations to be run as lightweight
containers. Each container appears as a service
<literal>container-<replaceable>name</replaceable></literal>
on the host system, allowing it to be started and stopped via
<command>systemctl</command> .
'';
};
};
config = mkIf (config.boot.enableContainers) (let
unit = {
description = "Container '%i'";
unitConfig.RequiresMountsFor = [ "/var/lib/containers/%i" ];
path = [ pkgs.iproute ];
environment.INSTANCE = "%i";
environment.root = "/var/lib/containers/%i";
preStart = preStartScript {};
script = startScript {};
postStart = postStartScript {};
preStop =
''
pid="$(cat /run/containers/$INSTANCE.pid)"
if [ -n "$pid" ]; then
kill -RTMIN+4 "$pid"
fi
rm -f "/run/containers/$INSTANCE.pid"
'';
restartIfChanged = false;
serviceConfig = {
ExecReload = pkgs.writeScript "reload-container"
''
#! ${pkgs.stdenv.shell} -e
${pkgs.nixos-container}/bin/nixos-container run "$INSTANCE" -- \
bash --login -c "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/bin/switch-to-configuration test"
'';
SyslogIdentifier = "container %i";
EnvironmentFile = "-/etc/containers/%i.conf";
Type = "notify";
# Note that on reboot, systemd-nspawn returns 133, so this
# unit will be restarted. On poweroff, it returns 0, so the
# unit won't be restarted.
RestartForceExitStatus = "133";
SuccessExitStatus = "133";
Restart = "on-failure";
# Hack: we don't want to kill systemd-nspawn, since we call
# "machinectl poweroff" in preStop to shut down the
# container cleanly. But systemd requires sending a signal
# (at least if we want remaining processes to be killed
# after the timeout). So send an ignored signal.
KillMode = "mixed";
KillSignal = "WINCH";
DevicePolicy = "closed";
};
};
in {
systemd.services = listToAttrs (filter (x: x.value != null) (
# The generic container template used by imperative containers
[{ name = "container@"; value = unit; }]
# declarative containers
++ (mapAttrsToList (name: cfg: nameValuePair "container@${name}" (
unit // {
preStart = preStartScript cfg;
script = startScript cfg;
postStart = postStartScript cfg;
} // (
if cfg.autoStart then
{
wantedBy = [ "multi-user.target" ];
wants = [ "network.target" ];
after = [ "network.target" ];
restartTriggers = [ cfg.path ];
reloadIfChanged = true;
}
else {})
)) config.containers)
));
# Generate a configuration file in /etc/containers for each
# container so that container@.target can get the container
# configuration.
environment.etc = mapAttrs' (name: cfg: nameValuePair "containers/${name}.conf"
{ text =
''
SYSTEM_PATH=${cfg.path}
${optionalString cfg.privateNetwork ''
PRIVATE_NETWORK=1
${optionalString (cfg.hostBridge != null) ''
HOST_BRIDGE=${cfg.hostBridge}
''}
${optionalString (cfg.hostAddress != null) ''
HOST_ADDRESS=${cfg.hostAddress}
''}
${optionalString (cfg.hostAddress6 != null) ''
HOST_ADDRESS6=${cfg.hostAddress6}
''}
${optionalString (cfg.localAddress != null) ''
LOCAL_ADDRESS=${cfg.localAddress}
''}
${optionalString (cfg.localAddress6 != null) ''
LOCAL_ADDRESS6=${cfg.localAddress6}
''}
''}
INTERFACES="${toString cfg.interfaces}"
${optionalString cfg.autoStart ''
AUTO_START=1
''}
EXTRA_NSPAWN_FLAGS="${mkBindFlags cfg.bindMounts}"
'';
}) config.containers;
# Generate /etc/hosts entries for the containers.
networking.extraHosts = concatStrings (mapAttrsToList (name: cfg: optionalString (cfg.localAddress != null)
''
${cfg.localAddress} ${name}.containers
'') config.containers);
networking.dhcpcd.denyInterfaces = [ "ve-*" ];
environment.systemPackages = [ pkgs.nixos-container ];
});
}