11use std:: collections:: HashSet ;
2+ use std:: thread;
3+ use std:: time:: Duration ;
24
35use promql_parser:: parser:: Expr ;
46use promql_utilities:: data_model:: KeyByLabelNames ;
@@ -7,6 +9,11 @@ use tracing::{debug, warn};
79
810use crate :: error:: ControllerError ;
911
12+ /// Maximum number of attempts (including the first) for a Prometheus request that keeps returning 503 (service not yet ready).
13+ const MAX_RETRIES : u32 = 15 ;
14+ /// Fixed delay the calling thread sleeps after each 503 response before the next attempt.
15+ const RETRY_DELAY : Duration = Duration :: from_secs ( 2 ) ;
16+
1017/// Walk a PromQL AST and collect all metric names referenced by VectorSelectors.
1118fn collect_metric_names ( expr : & Expr , names : & mut HashSet < String > ) {
1219 match expr {
@@ -69,65 +76,86 @@ fn fetch_labels_for_metric(
6976) -> Result < Option < Vec < String > > , ControllerError > {
7077 let url = format ! ( "{}/api/v1/series" , prometheus_url. trim_end_matches( '/' ) ) ;
7178 let client = reqwest:: blocking:: Client :: new ( ) ;
72- let response = client
73- . get ( & url)
74- . query ( & [ ( "match[]" , metric_name) ] )
75- . send ( )
76- . map_err ( |e| {
79+
80+ for attempt in 1 ..=MAX_RETRIES {
81+ let response = client
82+ . get ( & url)
83+ . query ( & [ ( "match[]" , metric_name) ] )
84+ . send ( )
85+ . map_err ( |e| {
86+ ControllerError :: PrometheusClient ( format ! (
87+ "HTTP request failed for metric '{}': {}" ,
88+ metric_name, e
89+ ) )
90+ } ) ?;
91+
92+ let status = response. status ( ) ;
93+
94+ if status == reqwest:: StatusCode :: SERVICE_UNAVAILABLE {
95+ warn ! (
96+ "Prometheus returned 503 for metric '{}' (attempt {}/{}); retrying in {}s" ,
97+ metric_name,
98+ attempt,
99+ MAX_RETRIES ,
100+ RETRY_DELAY . as_secs( ) ,
101+ ) ;
102+ thread:: sleep ( RETRY_DELAY ) ;
103+ continue ;
104+ }
105+
106+ if !status. is_success ( ) {
107+ return Err ( ControllerError :: PrometheusClient ( format ! (
108+ "Prometheus returned HTTP {} for metric '{}'" ,
109+ status, metric_name
110+ ) ) ) ;
111+ }
112+
113+ let body: serde_json:: Value = response. json ( ) . map_err ( |e| {
77114 ControllerError :: PrometheusClient ( format ! (
78- "HTTP request failed for metric '{}': {}" ,
115+ "Failed to parse Prometheus response for metric '{}': {}" ,
79116 metric_name, e
80117 ) )
81118 } ) ?;
82119
83- if !response. status ( ) . is_success ( ) {
84- return Err ( ControllerError :: PrometheusClient ( format ! (
85- "Prometheus returned HTTP {} for metric '{}'" ,
86- response. status( ) ,
87- metric_name
88- ) ) ) ;
89- }
90-
91- let body: serde_json:: Value = response. json ( ) . map_err ( |e| {
92- ControllerError :: PrometheusClient ( format ! (
93- "Failed to parse Prometheus response for metric '{}': {}" ,
94- metric_name, e
95- ) )
96- } ) ?;
120+ let data = match body. get ( "data" ) . and_then ( |d| d. as_array ( ) ) {
121+ Some ( arr) => arr,
122+ None => {
123+ warn ! (
124+ "Prometheus returned no 'data' array for metric '{}'; skipping" ,
125+ metric_name
126+ ) ;
127+ return Ok ( None ) ;
128+ }
129+ } ;
97130
98- let data = match body. get ( "data" ) . and_then ( |d| d. as_array ( ) ) {
99- Some ( arr) => arr,
100- None => {
131+ if data. is_empty ( ) {
101132 warn ! (
102- "Prometheus returned no 'data' array for metric '{}'; skipping" ,
133+ "Prometheus returned no series for metric '{}' in the last 5 minutes ; skipping" ,
103134 metric_name
104135 ) ;
105136 return Ok ( None ) ;
106137 }
107- } ;
108-
109- if data. is_empty ( ) {
110- warn ! (
111- "Prometheus returned no series for metric '{}' in the last 5 minutes; skipping" ,
112- metric_name
113- ) ;
114- return Ok ( None ) ;
115- }
116138
117- // Collect all unique label key names across all returned series,
118- // filtering out internal __*__ labels.
119- let mut label_keys: HashSet < String > = HashSet :: new ( ) ;
120- for series in data {
121- if let Some ( labels) = series. as_object ( ) {
122- for key in labels. keys ( ) {
123- if !key. starts_with ( "__" ) {
124- label_keys. insert ( key. clone ( ) ) ;
139+ // Collect all unique label key names across all returned series,
140+ // filtering out internal __*__ labels.
141+ let mut label_keys: HashSet < String > = HashSet :: new ( ) ;
142+ for series in data {
143+ if let Some ( labels) = series. as_object ( ) {
144+ for key in labels. keys ( ) {
145+ if !key. starts_with ( "__" ) {
146+ label_keys. insert ( key. clone ( ) ) ;
147+ }
125148 }
126149 }
127150 }
151+
152+ return Ok ( Some ( label_keys. into_iter ( ) . collect ( ) ) ) ;
128153 }
129154
130- Ok ( Some ( label_keys. into_iter ( ) . collect ( ) ) )
155+ Err ( ControllerError :: PrometheusClient ( format ! (
156+ "Prometheus returned 503 for metric '{}' after {} attempts; giving up" ,
157+ metric_name, MAX_RETRIES
158+ ) ) )
131159}
132160
133161/// Build a `PromQLSchema` by querying Prometheus for each metric name found in the given
0 commit comments