Merge pull request #262839 from RaitoBezarius/qemu-vm/timeout

This commit is contained in:
Ryan Lahfa 2023-10-29 17:21:10 +01:00 committed by GitHub
commit 92fdbd284c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 104 additions and 18 deletions

View file

@ -11,6 +11,7 @@
, tesseract4
, vde2
, extraPythonPackages ? (_ : [])
, nixosTests
}:
python3Packages.buildPythonApplication {
@ -31,6 +32,10 @@ python3Packages.buildPythonApplication {
++ (lib.optionals enableOCR [ imagemagick_light tesseract4 ])
++ extraPythonPackages python3Packages;
passthru.tests = {
inherit (nixosTests.nixos-test-driver) driver-timeout;
};
doCheck = true;
nativeCheckInputs = with python3Packages; [ mypy ruff black ];
checkPhase = ''

View file

@ -76,6 +76,14 @@ def main() -> None:
nargs="*",
help="vlans to span by the driver",
)
arg_parser.add_argument(
"--global-timeout",
type=int,
metavar="GLOBAL_TIMEOUT",
action=EnvDefault,
envvar="globalTimeout",
help="Timeout in seconds for the whole test",
)
arg_parser.add_argument(
"-o",
"--output_directory",
@ -103,6 +111,7 @@ def main() -> None:
args.testscript.read_text(),
args.output_directory.resolve(),
args.keep_vm_state,
args.global_timeout,
) as driver:
if args.interactive:
history_dir = os.getcwd()

View file

@ -1,6 +1,8 @@
import os
import re
import signal
import tempfile
import threading
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Callable, ContextManager, Dict, Iterator, List, Optional, Union
@ -41,6 +43,8 @@ class Driver:
vlans: List[VLan]
machines: List[Machine]
polling_conditions: List[PollingCondition]
global_timeout: int
race_timer: threading.Timer
def __init__(
self,
@ -49,9 +53,12 @@ class Driver:
tests: str,
out_dir: Path,
keep_vm_state: bool = False,
global_timeout: int = 24 * 60 * 60 * 7,
):
self.tests = tests
self.out_dir = out_dir
self.global_timeout = global_timeout
self.race_timer = threading.Timer(global_timeout, self.terminate_test)
tmp_dir = get_tmp_dir()
@ -82,6 +89,7 @@ class Driver:
def __exit__(self, *_: Any) -> None:
with rootlog.nested("cleanup"):
self.race_timer.cancel()
for machine in self.machines:
machine.release()
@ -144,6 +152,10 @@ class Driver:
def run_tests(self) -> None:
"""Run the test script (for non-interactive test runs)"""
rootlog.info(
f"Test will time out and terminate in {self.global_timeout} seconds"
)
self.race_timer.start()
self.test_script()
# TODO: Collect coverage data
for machine in self.machines:
@ -161,6 +173,19 @@ class Driver:
with rootlog.nested("wait for all VMs to finish"):
for machine in self.machines:
machine.wait_for_shutdown()
self.race_timer.cancel()
def terminate_test(self) -> None:
    """Abort the test run once the global timeout has expired.

    This is the callback handed to ``race_timer``, so it normally runs on
    the timer thread rather than on the thread executing the test script.
    It releases every machine and then sends SIGTERM to the driver's own
    process.
    """
    with rootlog.nested("timeout reached; test terminating..."):
        try:
            for machine in self.machines:
                machine.release()
        finally:
            # `sys.exit` cannot stop the main thread from another thread,
            # so signal our own process instead. This also defeats test
            # scripts that catch and swallow every exception.
            # Sending the signal from `finally` guarantees that a failing
            # `release()` cannot leave a timed-out test running.
            os.kill(os.getpid(), signal.SIGTERM)
def create_machine(self, args: Dict[str, Any]) -> Machine:
tmp_dir = get_tmp_dir()

View file

@ -42,6 +42,7 @@ rec {
, nodes ? {}
, testScript
, enableOCR ? false
, globalTimeout ? (60 * 60)
, name ? "unnamed"
, skipTypeCheck ? false
# Skip linting (mainly intended for faster dev cycles)

View file

@ -94,6 +94,7 @@ let
wrapProgram $out/bin/nixos-test-driver \
--set startScripts "''${vmStartScripts[*]}" \
--set testScript "$out/test-script" \
--set globalTimeout "${toString config.globalTimeout}" \
--set vlans '${toString vlans}' \
${lib.escapeShellArgs (lib.concatMap (arg: ["--add-flags" arg]) config.extraDriverArgs)}
'';
@ -123,6 +124,18 @@ in
defaultText = "hostPkgs.qemu_test";
};
# Upper bound, in seconds, on the duration of the whole test run. The value
# is exported to the wrapped test driver as the `globalTimeout` variable.
globalTimeout = mkOption {
  type = types.int;
  default = 60 * 60;
  example = 10 * 60;
  description = mdDoc ''
    A global timeout for the complete test, expressed in seconds.
    Beyond that timeout, every resource will be killed and released and the test will fail.
    By default, we use a 1 hour timeout.
  '';
};
enableOCR = mkOption {
description = mdDoc ''
Whether to enable Optical Character Recognition functionality for

View file

@ -16,6 +16,15 @@ in
'';
};
# Internal option holding the plain `mkDerivation` result that `test` wraps
# in `lib.lazyDerivation`.
rawTestDerivation = mkOption {
  internal = true;
  type = types.package;
  description = mdDoc ''
    Unfiltered version of `test`, for troubleshooting the test framework and `testBuildFailure` in the test framework's test suite.
    This is not intended for general use. Use `test` instead.
  '';
};
test = mkOption {
type = types.package;
# TODO: can the interactive driver be configured to access the network?
@ -29,8 +38,7 @@ in
};
config = {
test = lib.lazyDerivation { # lazyDerivation improves performance when only passthru items and/or meta are used.
derivation = hostPkgs.stdenv.mkDerivation {
rawTestDerivation = hostPkgs.stdenv.mkDerivation {
name = "vm-test-run-${config.name}";
requiredSystemFeatures = [ "kvm" "nixos-test" ];
@ -48,6 +56,8 @@ in
meta = config.meta;
};
test = lib.lazyDerivation { # lazyDerivation improves performance when only passthru items and/or meta are used.
derivation = config.rawTestDerivation;
inherit (config) passthru meta;
};

View file

@ -90,6 +90,14 @@ in {
lib-extend = handleTestOn [ "x86_64-linux" "aarch64-linux" ] ./nixos-test-driver/lib-extend.nix {};
node-name = runTest ./nixos-test-driver/node-name.nix;
busybox = runTest ./nixos-test-driver/busybox.nix;
# Regression test for the test driver's global timeout. It builds the raw
# test derivation of ./nixos-test-driver/timeout.nix through
# `testBuildFailure`, then checks the captured log for the timeout message
# and the captured exit status for 143.
driver-timeout = pkgs.runCommand "ensure-timeout-induced-failure" {
failed = pkgs.testers.testBuildFailure ((runTest ./nixos-test-driver/timeout.nix).config.rawTestDerivation);
} ''
grep -F "timeout reached; test terminating" $failed/testBuildFailure.log
# The program will always be terminated by SIGTERM (143) if it waits for the deadline thread.
[[ 143 = $(cat $failed/testBuildFailure.exit) ]]
touch $out
'';
};
# NixOS vm tests and non-vm unit tests

View file

@ -0,0 +1,15 @@
# NixOS VM test whose script sleeps longer than the configured global timeout.
{
  name = "Test that sleep of 6 seconds fails a timeout of 5 seconds";

  # Shorter than the `sleep 6` issued by the test script below.
  globalTimeout = 5;

  # A single node with an empty configuration.
  nodes.machine = { pkgs, ... }: { };

  testScript = ''
    start_all()
    machine.wait_for_unit("multi-user.target")
    machine.succeed("sleep 6")
  '';
}