Skip to content

Commit 9952c17

Browse files
Added retry logic to asap-planner for getting metrics/labels from Prometheus
1 parent 0b93469 commit 9952c17

1 file changed

Lines changed: 70 additions & 42 deletions

File tree

asap-planner-rs/src/prometheus_client.rs

Lines changed: 70 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
use std::collections::HashSet;
2+
use std::thread;
3+
use std::time::Duration;
24

35
use promql_parser::parser::Expr;
46
use promql_utilities::data_model::KeyByLabelNames;
@@ -7,6 +9,11 @@ use tracing::{debug, warn};
79

810
use crate::error::ControllerError;
911

12+
/// Maximum number of attempts for a Prometheus request that keeps returning 503 (service not yet ready).
13+
const MAX_RETRIES: u32 = 15;
14+
/// Delay between retries.
15+
const RETRY_DELAY: Duration = Duration::from_secs(2);
16+
1017
/// Walk a PromQL AST and collect all metric names referenced by VectorSelectors.
1118
fn collect_metric_names(expr: &Expr, names: &mut HashSet<String>) {
1219
match expr {
@@ -69,65 +76,86 @@ fn fetch_labels_for_metric(
6976
) -> Result<Option<Vec<String>>, ControllerError> {
7077
let url = format!("{}/api/v1/series", prometheus_url.trim_end_matches('/'));
7178
let client = reqwest::blocking::Client::new();
72-
let response = client
73-
.get(&url)
74-
.query(&[("match[]", metric_name)])
75-
.send()
76-
.map_err(|e| {
79+
80+
for attempt in 1..=MAX_RETRIES {
81+
let response = client
82+
.get(&url)
83+
.query(&[("match[]", metric_name)])
84+
.send()
85+
.map_err(|e| {
86+
ControllerError::PrometheusClient(format!(
87+
"HTTP request failed for metric '{}': {}",
88+
metric_name, e
89+
))
90+
})?;
91+
92+
let status = response.status();
93+
94+
if status == reqwest::StatusCode::SERVICE_UNAVAILABLE {
95+
warn!(
96+
"Prometheus returned 503 for metric '{}' (attempt {}/{}); retrying in {}s",
97+
metric_name,
98+
attempt,
99+
MAX_RETRIES,
100+
RETRY_DELAY.as_secs(),
101+
);
102+
thread::sleep(RETRY_DELAY);
103+
continue;
104+
}
105+
106+
if !status.is_success() {
107+
return Err(ControllerError::PrometheusClient(format!(
108+
"Prometheus returned HTTP {} for metric '{}'",
109+
status, metric_name
110+
)));
111+
}
112+
113+
let body: serde_json::Value = response.json().map_err(|e| {
77114
ControllerError::PrometheusClient(format!(
78-
"HTTP request failed for metric '{}': {}",
115+
"Failed to parse Prometheus response for metric '{}': {}",
79116
metric_name, e
80117
))
81118
})?;
82119

83-
if !response.status().is_success() {
84-
return Err(ControllerError::PrometheusClient(format!(
85-
"Prometheus returned HTTP {} for metric '{}'",
86-
response.status(),
87-
metric_name
88-
)));
89-
}
90-
91-
let body: serde_json::Value = response.json().map_err(|e| {
92-
ControllerError::PrometheusClient(format!(
93-
"Failed to parse Prometheus response for metric '{}': {}",
94-
metric_name, e
95-
))
96-
})?;
120+
let data = match body.get("data").and_then(|d| d.as_array()) {
121+
Some(arr) => arr,
122+
None => {
123+
warn!(
124+
"Prometheus returned no 'data' array for metric '{}'; skipping",
125+
metric_name
126+
);
127+
return Ok(None);
128+
}
129+
};
97130

98-
let data = match body.get("data").and_then(|d| d.as_array()) {
99-
Some(arr) => arr,
100-
None => {
131+
if data.is_empty() {
101132
warn!(
102-
"Prometheus returned no 'data' array for metric '{}'; skipping",
133+
"Prometheus returned no series for metric '{}' in the last 5 minutes; skipping",
103134
metric_name
104135
);
105136
return Ok(None);
106137
}
107-
};
108-
109-
if data.is_empty() {
110-
warn!(
111-
"Prometheus returned no series for metric '{}' in the last 5 minutes; skipping",
112-
metric_name
113-
);
114-
return Ok(None);
115-
}
116138

117-
// Collect all unique label key names across all returned series,
118-
// filtering out internal __*__ labels.
119-
let mut label_keys: HashSet<String> = HashSet::new();
120-
for series in data {
121-
if let Some(labels) = series.as_object() {
122-
for key in labels.keys() {
123-
if !key.starts_with("__") {
124-
label_keys.insert(key.clone());
139+
// Collect all unique label key names across all returned series,
140+
// filtering out internal __*__ labels.
141+
let mut label_keys: HashSet<String> = HashSet::new();
142+
for series in data {
143+
if let Some(labels) = series.as_object() {
144+
for key in labels.keys() {
145+
if !key.starts_with("__") {
146+
label_keys.insert(key.clone());
147+
}
125148
}
126149
}
127150
}
151+
152+
return Ok(Some(label_keys.into_iter().collect()));
128153
}
129154

130-
Ok(Some(label_keys.into_iter().collect()))
155+
Err(ControllerError::PrometheusClient(format!(
156+
"Prometheus returned 503 for metric '{}' after {} attempts; giving up",
157+
metric_name, MAX_RETRIES
158+
)))
131159
}
132160

133161
/// Build a `PromQLSchema` by querying Prometheus for each metric name found in the given

0 commit comments

Comments
 (0)