ws: quit if any worker quits

This commit is contained in:
Joakim Frostegård 2024-02-03 22:56:08 +01:00
parent d7e06468c3
commit 93f8995a72
6 changed files with 153 additions and 194 deletions

View file

@ -3,19 +3,15 @@ pub mod config;
pub mod workers;
use std::sync::Arc;
use std::thread::{sleep, Builder, JoinHandle};
use std::time::Duration;
use anyhow::Context;
use aquatic_common::cpu_pinning::glommio::{get_worker_placement, set_affinity_for_util_worker};
use aquatic_common::cpu_pinning::WorkerIndex;
use aquatic_common::rustls_config::create_rustls_config;
use aquatic_common::{PanicSentinelWatcher, ServerStartInstant};
use aquatic_common::{ServerStartInstant, WorkerType};
use arc_swap::ArcSwap;
use glommio::{channels::channel_mesh::MeshBuilder, prelude::*};
use signal_hook::{
consts::{SIGTERM, SIGUSR1},
iterator::Signals,
};
use signal_hook::{consts::SIGUSR1, iterator::Signals};
use aquatic_common::access_list::update_access_list;
use aquatic_common::privileges::PrivilegeDropper;
@ -35,45 +31,18 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
));
}
let mut signals = Signals::new([SIGUSR1, SIGTERM])?;
#[cfg(feature = "prometheus")]
if config.metrics.run_prometheus_endpoint {
use metrics_exporter_prometheus::PrometheusBuilder;
let idle_timeout = config
.cleaning
.connection_cleaning_interval
.max(config.cleaning.torrent_cleaning_interval)
.max(config.metrics.torrent_count_update_interval)
* 2;
PrometheusBuilder::new()
.idle_timeout(
metrics_util::MetricKindMask::GAUGE,
Some(Duration::from_secs(idle_timeout)),
)
.with_http_listener(config.metrics.prometheus_endpoint_address)
.install()
.with_context(|| {
format!(
"Install prometheus endpoint on {}",
config.metrics.prometheus_endpoint_address
)
})?;
}
let mut signals = Signals::new([SIGUSR1])?;
let state = State::default();
update_access_list(&config.access_list, &state.access_list)?;
let num_peers = config.socket_workers + config.swarm_workers;
let num_mesh_peers = config.socket_workers + config.swarm_workers;
let request_mesh_builder = MeshBuilder::partial(num_peers, SHARED_IN_CHANNEL_SIZE);
let response_mesh_builder = MeshBuilder::partial(num_peers, SHARED_IN_CHANNEL_SIZE * 16);
let control_mesh_builder = MeshBuilder::partial(num_peers, SHARED_IN_CHANNEL_SIZE * 16);
let request_mesh_builder = MeshBuilder::partial(num_mesh_peers, SHARED_IN_CHANNEL_SIZE);
let response_mesh_builder = MeshBuilder::partial(num_mesh_peers, SHARED_IN_CHANNEL_SIZE * 16);
let control_mesh_builder = MeshBuilder::partial(num_mesh_peers, SHARED_IN_CHANNEL_SIZE * 16);
let (sentinel_watcher, sentinel) = PanicSentinelWatcher::create_with_sentinel();
let priv_dropper = PrivilegeDropper::new(config.privileges.clone(), config.socket_workers);
let opt_tls_config = if config.network.enable_tls {
@ -98,10 +67,9 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
let server_start_instant = ServerStartInstant::new();
let mut executors = Vec::new();
let mut join_handles = Vec::new();
for i in 0..(config.socket_workers) {
let sentinel = sentinel.clone();
let config = config.clone();
let state = state.clone();
let opt_tls_config = opt_tls_config.clone();
@ -110,120 +78,137 @@ pub fn run(config: Config) -> ::anyhow::Result<()> {
let response_mesh_builder = response_mesh_builder.clone();
let priv_dropper = priv_dropper.clone();
let placement = get_worker_placement(
&config.cpu_pinning,
config.socket_workers,
config.swarm_workers,
WorkerIndex::SocketWorker(i),
)?;
let builder = LocalExecutorBuilder::new(placement).name(&format!("socket-{:02}", i + 1));
let executor = builder
.spawn(move || async move {
workers::socket::run_socket_worker(
sentinel,
config,
state,
opt_tls_config,
control_mesh_builder,
request_mesh_builder,
response_mesh_builder,
priv_dropper,
server_start_instant,
i,
)
.await
let handle = Builder::new()
.name(format!("socket-{:02}", i + 1))
.spawn(move || {
LocalExecutorBuilder::default()
.make()
.map_err(|err| anyhow::anyhow!("Spawning executor failed: {:#}", err))?
.run(workers::socket::run_socket_worker(
config,
state,
opt_tls_config,
control_mesh_builder,
request_mesh_builder,
response_mesh_builder,
priv_dropper,
server_start_instant,
i,
))
})
.map_err(|err| anyhow::anyhow!("Spawning executor failed: {:#}", err))?;
.context("spawn socket worker")?;
executors.push(executor);
join_handles.push((WorkerType::Socket(i), handle));
}
::log::info!("spawned socket workers");
for i in 0..(config.swarm_workers) {
let sentinel = sentinel.clone();
let config = config.clone();
let state = state.clone();
let control_mesh_builder = control_mesh_builder.clone();
let request_mesh_builder = request_mesh_builder.clone();
let response_mesh_builder = response_mesh_builder.clone();
let placement = get_worker_placement(
&config.cpu_pinning,
config.socket_workers,
config.swarm_workers,
WorkerIndex::SwarmWorker(i),
)?;
let builder = LocalExecutorBuilder::new(placement).name(&format!("swarm-{:02}", i + 1));
let executor = builder
.spawn(move || async move {
workers::swarm::run_swarm_worker(
sentinel,
config,
state,
control_mesh_builder,
request_mesh_builder,
response_mesh_builder,
server_start_instant,
i,
)
.await
let handle = Builder::new()
.name(format!("socket-{:02}", i + 1))
.spawn(move || {
LocalExecutorBuilder::default()
.make()
.map_err(|err| anyhow::anyhow!("Spawning executor failed: {:#}", err))?
.run(workers::swarm::run_swarm_worker(
config,
state,
control_mesh_builder,
request_mesh_builder,
response_mesh_builder,
server_start_instant,
i,
))
})
.map_err(|err| anyhow::anyhow!("Spawning executor failed: {:#}", err))?;
.context("spawn swarm worker")?;
executors.push(executor);
join_handles.push((WorkerType::Socket(i), handle));
}
::log::info!("spawned swarm workers");
#[cfg(feature = "prometheus")]
if config.metrics.run_prometheus_endpoint {
let idle_timeout = config
.cleaning
.connection_cleaning_interval
.max(config.cleaning.torrent_cleaning_interval)
.max(config.metrics.torrent_count_update_interval)
* 2;
if config.cpu_pinning.active {
set_affinity_for_util_worker(
&config.cpu_pinning,
config.socket_workers,
config.swarm_workers,
let handle = aquatic_common::spawn_prometheus_endpoint(
config.metrics.prometheus_endpoint_address,
Some(Duration::from_secs(idle_timeout)),
)?;
join_handles.push((WorkerType::Prometheus, handle));
}
for signal in &mut signals {
match signal {
SIGUSR1 => {
let _ = update_access_list(&config.access_list, &state.access_list);
// Spawn signal handler thread
{
let handle: JoinHandle<anyhow::Result<()>> = Builder::new()
.name("signals".into())
.spawn(move || {
for signal in &mut signals {
match signal {
SIGUSR1 => {
let _ = update_access_list(&config.access_list, &state.access_list);
if let Some(tls_config) = opt_tls_config.as_ref() {
match ::std::fs::read(&config.network.tls_certificate_path) {
Ok(data) if &data == opt_tls_cert_data.as_ref().unwrap() => {
::log::info!("skipping tls config update: certificate identical to currently loaded");
}
Ok(data) => {
match create_rustls_config(
&config.network.tls_certificate_path,
&config.network.tls_private_key_path,
) {
Ok(config) => {
tls_config.store(Arc::new(config));
opt_tls_cert_data = Some(data);
if let Some(tls_config) = opt_tls_config.as_ref() {
match ::std::fs::read(&config.network.tls_certificate_path) {
Ok(data) if &data == opt_tls_cert_data.as_ref().unwrap() => {
::log::info!("skipping tls config update: certificate identical to currently loaded");
}
Ok(data) => {
match create_rustls_config(
&config.network.tls_certificate_path,
&config.network.tls_private_key_path,
) {
Ok(config) => {
tls_config.store(Arc::new(config));
opt_tls_cert_data = Some(data);
::log::info!("successfully updated tls config");
::log::info!("successfully updated tls config");
}
Err(err) => ::log::error!("could not update tls config: {:#}", err),
}
}
Err(err) => ::log::error!("couldn't read tls certificate file: {:#}", err),
}
Err(err) => ::log::error!("could not update tls config: {:#}", err),
}
}
Err(err) => ::log::error!("couldn't read tls certificate file: {:#}", err),
_ => unreachable!(),
}
}
Ok(())
})
.context("spawn signal worker")?;
join_handles.push((WorkerType::Signals, handle));
}
loop {
for (i, (_, handle)) in join_handles.iter().enumerate() {
if handle.is_finished() {
let (worker_type, handle) = join_handles.remove(i);
match handle.join() {
Ok(Ok(())) => {
return Err(anyhow::anyhow!("{} stopped", worker_type));
}
Ok(Err(err)) => {
return Err(err.context(format!("{} stopped", worker_type)));
}
Err(_) => {
return Err(anyhow::anyhow!("{} panicked", worker_type));
}
}
}
SIGTERM => {
if sentinel_watcher.panic_was_triggered() {
return Err(anyhow::anyhow!("worker thread panicked"));
} else {
return Ok(());
}
}
_ => unreachable!(),
}
}
Ok(())
sleep(Duration::from_secs(5));
}
}