@@ -35,8 +35,8 @@ use stackable_operator::{
3535 api:: {
3636 apps:: v1:: { StatefulSet , StatefulSetSpec } ,
3737 core:: v1:: {
38- ConfigMap , ConfigMapVolumeSource , ContainerPort , EnvVar , EnvVarSource , Probe ,
39- Secret , SecretKeySelector , Service , ServicePort , ServiceSpec , TCPSocketAction ,
38+ ConfigMap , ConfigMapVolumeSource , ContainerPort , EnvVar , EnvVarSource , ExecAction ,
39+ HTTPGetAction , Probe , Secret , SecretKeySelector , Service , ServicePort , ServiceSpec ,
4040 Volume ,
4141 } ,
4242 } ,
@@ -1061,6 +1061,8 @@ fn build_rolegroup_statefulset(
10611061 . context ( AddVolumeMountSnafu ) ?
10621062 . add_container_ports ( container_ports ( trino) )
10631063 . resources ( merged_config. resources . clone ( ) . into ( ) )
1064+ // The probes are set on coordinators and workers
1065+ . startup_probe ( startup_probe ( trino) )
10641066 . readiness_probe ( readiness_probe ( trino) )
10651067 . liveness_probe ( liveness_probe ( trino) )
10661068 . build ( ) ;
@@ -1484,40 +1486,74 @@ fn container_ports(trino: &v1alpha1::TrinoCluster) -> Vec<ContainerPort> {
14841486 ports
14851487}
14861488
1487- fn readiness_probe ( trino : & v1alpha1:: TrinoCluster ) -> Probe {
1488- let port_name = if trino. expose_https_port ( ) {
1489- HTTPS_PORT_NAME
1490- } else {
1491- HTTP_PORT_NAME
1492- } ;
1489+ fn startup_probe ( trino : & v1alpha1:: TrinoCluster ) -> Probe {
1490+ Probe {
1491+ exec : Some ( finished_starting_probe ( trino) ) ,
1492+ period_seconds : Some ( 5 ) ,
1493+ // Give the coordinator or worker 10 minutes to start up
1494+ failure_threshold : Some ( 120 ) ,
1495+ timeout_seconds : Some ( 3 ) ,
1496+ ..Default :: default ( )
1497+ }
1498+ }
14931499
1500+ fn readiness_probe ( trino : & v1alpha1:: TrinoCluster ) -> Probe {
14941501 Probe {
1495- initial_delay_seconds : Some ( 10 ) ,
1496- period_seconds : Some ( 10 ) ,
1497- failure_threshold : Some ( 5 ) ,
1498- tcp_socket : Some ( TCPSocketAction {
1499- port : IntOrString :: String ( port_name. to_string ( ) ) ,
1500- ..TCPSocketAction :: default ( )
1501- } ) ,
1502+ http_get : Some ( http_get_probe ( trino) ) ,
1503+ period_seconds : Some ( 5 ) ,
1504+ failure_threshold : Some ( 1 ) ,
1505+ timeout_seconds : Some ( 3 ) ,
15021506 ..Probe :: default ( )
15031507 }
15041508}
15051509
15061510fn liveness_probe ( trino : & v1alpha1:: TrinoCluster ) -> Probe {
1507- let port_name = if trino. expose_https_port ( ) {
1508- HTTPS_PORT_NAME
1511+ Probe {
1512+ http_get : Some ( http_get_probe ( trino) ) ,
1513+ period_seconds : Some ( 5 ) ,
1514+ // Coordinators are currently not highly available, so you always have a singe instance.
1515+ // Restarting it causes all queries to fail, so let's not restart it directly after the first
1516+ // probe failure, but wait for 3 failures
1517+ // NOTE: This also applies to workers
1518+ failure_threshold : Some ( 3 ) ,
1519+ timeout_seconds : Some ( 3 ) ,
1520+ ..Probe :: default ( )
1521+ }
1522+ }
1523+
1524+ /// Check that `/v1/info` returns `200`.
1525+ ///
1526+ /// This is the same probe as the [upstream helm-chart](https://github.com/trinodb/charts/blob/7cd0a7bff6c52e0ee6ca6d5394cd72c150ad4379/charts/trino/templates/deployment-coordinator.yaml#L214)
1527+ /// is using.
1528+ fn http_get_probe ( trino : & v1alpha1:: TrinoCluster ) -> HTTPGetAction {
1529+ let ( schema, port_name) = if trino. expose_https_port ( ) {
1530+ ( "HTTPS" , HTTPS_PORT_NAME )
15091531 } else {
1510- HTTP_PORT_NAME
1532+ ( "HTTP" , HTTP_PORT_NAME )
15111533 } ;
15121534
1513- Probe {
1514- initial_delay_seconds : Some ( 30 ) ,
1515- period_seconds : Some ( 10 ) ,
1516- tcp_socket : Some ( TCPSocketAction {
1517- port : IntOrString :: String ( port_name. to_string ( ) ) ,
1518- ..TCPSocketAction :: default ( )
1519- } ) ,
1520- ..Probe :: default ( )
1535+ HTTPGetAction {
1536+ port : IntOrString :: String ( port_name. to_string ( ) ) ,
1537+ scheme : Some ( schema. to_string ( ) ) ,
1538+ path : Some ( "/v1/info" . to_string ( ) ) ,
1539+ ..Default :: default ( )
1540+ }
1541+ }
1542+
1543+ /// Wait until `/v1/info` returns `"starting":false`.
1544+ ///
1545+ /// This probe works on coordinators and workers.
1546+ fn finished_starting_probe ( trino : & v1alpha1:: TrinoCluster ) -> ExecAction {
1547+ let port = trino. exposed_port ( ) ;
1548+ ExecAction {
1549+ command : Some ( vec ! [
1550+ "/bin/bash" . to_string( ) ,
1551+ "-x" . to_string( ) ,
1552+ "-euo" . to_string( ) ,
1553+ "pipefail" . to_string( ) ,
1554+ "-c" . to_string( ) ,
1555+ format!( "curl --fail --insecure https://127.0.0.1:{port}/v1/info | grep --silent '\\ \" starting\\ \" :false'" ) ,
1556+ ] ) ,
15211557 }
15221558}
15231559
0 commit comments