Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 171 additions & 0 deletions src/sentry/monitors/logic/incident_occurrence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
from __future__ import annotations

import logging
import uuid
from collections import Counter
from collections.abc import Mapping, Sequence
from datetime import datetime, timezone
from typing import TYPE_CHECKING

from django.utils.text import get_text_list
from django.utils.translation import gettext_lazy as _

from sentry.issues.grouptype import MonitorIncidentType
from sentry.monitors.models import (
CheckInStatus,
MonitorCheckIn,
MonitorEnvironment,
MonitorIncident,
)
from sentry.monitors.types import SimpleCheckIn

if TYPE_CHECKING:
from django.utils.functional import _StrPromise

logger = logging.getLogger(__name__)


def create_incident_occurrence(
failed_checkins: Sequence[SimpleCheckIn],
failed_checkin: MonitorCheckIn,
incident: MonitorIncident,
received: datetime | None,
) -> None:
from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka

monitor_env = failed_checkin.monitor_environment

if monitor_env is None:
return

current_timestamp = datetime.now(timezone.utc)

# Get last successful check-in to show in evidence display
last_successful_checkin_timestamp = "Never"
last_successful_checkin = monitor_env.get_last_successful_checkin()
if last_successful_checkin:
last_successful_checkin_timestamp = last_successful_checkin.date_added.isoformat()

occurrence = IssueOccurrence(
id=uuid.uuid4().hex,
resource_id=None,
project_id=monitor_env.monitor.project_id,
event_id=uuid.uuid4().hex,
fingerprint=[incident.grouphash],
type=MonitorIncidentType,
issue_title=f"Monitor failure: {monitor_env.monitor.name}",
subtitle="Your monitor has reached its failure threshold.",
evidence_display=[
IssueEvidence(
name="Failure reason",
value=str(get_failure_reason(failed_checkins)),
important=True,
),
IssueEvidence(
name="Environment",
value=monitor_env.get_environment().name,
important=False,
),
IssueEvidence(
name="Last successful check-in",
value=last_successful_checkin_timestamp,
important=False,
),
],
evidence_data={},
culprit="",
detection_time=current_timestamp,
level="error",
assignee=monitor_env.monitor.owner_actor,
)

if failed_checkin.trace_id:
trace_id = failed_checkin.trace_id.hex
else:
trace_id = None

event_data = {
"contexts": {"monitor": get_monitor_environment_context(monitor_env)},
"environment": monitor_env.get_environment().name,
"event_id": occurrence.event_id,
"fingerprint": [incident.grouphash],
"platform": "other",
"project_id": monitor_env.monitor.project_id,
# We set this to the time that the checkin that triggered the occurrence was written to relay if available
"received": (received if received else current_timestamp).isoformat(),
"sdk": None,
"tags": {
"monitor.id": str(monitor_env.monitor.guid),
"monitor.slug": str(monitor_env.monitor.slug),
"monitor.incident": str(incident.id),
},
"timestamp": current_timestamp.isoformat(),
}

if trace_id:
event_data["contexts"]["trace"] = {"trace_id": trace_id, "span_id": None}

produce_occurrence_to_kafka(
payload_type=PayloadType.OCCURRENCE,
occurrence=occurrence,
event_data=event_data,
)


HUMAN_FAILURE_STATUS_MAP: Mapping[int, _StrPromise] = {
CheckInStatus.ERROR: _("error"),
CheckInStatus.MISSED: _("missed"),
CheckInStatus.TIMEOUT: _("timeout"),
}

# Exists due to the vowel differences (A vs An) in the statuses
SINGULAR_HUMAN_FAILURE_MAP: Mapping[int, _StrPromise] = {
CheckInStatus.ERROR: _("An error check-in was detected"),
CheckInStatus.MISSED: _("A missed check-in was detected"),
CheckInStatus.TIMEOUT: _("A timeout check-in was detected"),
}


def get_failure_reason(failed_checkins: Sequence[SimpleCheckIn]):
"""
Builds a humam readible string from a list of failed check-ins.

"3 missed check-ins detected"
"2 missed check-ins, 1 timeout check-in and 1 error check-in were detected"
"A failed check-in was detected"
"""

status_counts = Counter(
checkin["status"]
for checkin in failed_checkins
if checkin["status"] in HUMAN_FAILURE_STATUS_MAP.keys()
)

if sum(status_counts.values()) == 1:
return SINGULAR_HUMAN_FAILURE_MAP[list(status_counts.keys())[0]]

human_status = get_text_list(
[
"%(count)d %(status)s" % {"count": count, "status": HUMAN_FAILURE_STATUS_MAP[status]}
for status, count in status_counts.items()
],
last_word=_("and"),
)

return _("%(problem_checkins)s check-ins detected") % {"problem_checkins": human_status}
Comment on lines +130 to +156

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Small cleanups in get_failure_reason (RUF015 + minor polish)

Two small improvements here:

  1. Key selection for the singular case: Ruff is right that list(status_counts.keys())[0] is wasteful; you can avoid materializing a list:
-    if sum(status_counts.values()) == 1:
-        return SINGULAR_HUMAN_FAILURE_MAP[list(status_counts.keys())[0]]
+    if sum(status_counts.values()) == 1:
+        key = next(iter(status_counts.keys()))
+        return SINGULAR_HUMAN_FAILURE_MAP[key]
  1. Docstring typo (optional): “humam readible” → “human readable” for clarity.

These changes are non-functional but bring the function in line with style and static-analysis guidance.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def get_failure_reason(failed_checkins: Sequence[SimpleCheckIn]):
"""
Builds a humam readible string from a list of failed check-ins.
"3 missed check-ins detected"
"2 missed check-ins, 1 timeout check-in and 1 error check-in were detected"
"A failed check-in was detected"
"""
status_counts = Counter(
checkin["status"]
for checkin in failed_checkins
if checkin["status"] in HUMAN_FAILURE_STATUS_MAP.keys()
)
if sum(status_counts.values()) == 1:
return SINGULAR_HUMAN_FAILURE_MAP[list(status_counts.keys())[0]]
human_status = get_text_list(
[
"%(count)d %(status)s" % {"count": count, "status": HUMAN_FAILURE_STATUS_MAP[status]}
for status, count in status_counts.items()
],
last_word=_("and"),
)
return _("%(problem_checkins)s check-ins detected") % {"problem_checkins": human_status}
def get_failure_reason(failed_checkins: Sequence[SimpleCheckIn]):
"""
Builds a humam readible string from a list of failed check-ins.
"3 missed check-ins detected"
"2 missed check-ins, 1 timeout check-in and 1 error check-in were detected"
"A failed check-in was detected"
"""
status_counts = Counter(
checkin["status"]
for checkin in failed_checkins
if checkin["status"] in HUMAN_FAILURE_STATUS_MAP.keys()
)
if sum(status_counts.values()) == 1:
key = next(iter(status_counts.keys()))
return SINGULAR_HUMAN_FAILURE_MAP[key]
human_status = get_text_list(
[
"%(count)d %(status)s" % {"count": count, "status": HUMAN_FAILURE_STATUS_MAP[status]}
for status, count in status_counts.items()
],
last_word=_("and"),
)
return _("%(problem_checkins)s check-ins detected") % {"problem_checkins": human_status}
🧰 Tools
🪛 Ruff (0.14.5)

146-146: Prefer next(iter(status_counts.keys())) over single element slice

Replace with next(iter(status_counts.keys()))

(RUF015)

🤖 Prompt for AI Agents
In src/sentry/monitors/logic/incident_occurrence.py around lines 130 to 156, the
singular-case key extraction is wastefully creating a list with
list(status_counts.keys())[0] and the docstring contains a typo "humam
readible"; change the singular-case selection to use an iterator (e.g.
next(iter(status_counts))) to avoid materializing a list, and fix the docstring
text to "human readable".



def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
config = monitor_environment.monitor.config.copy()
if "schedule_type" in config:
config["schedule_type"] = monitor_environment.monitor.get_schedule_type_display()

return {
"id": str(monitor_environment.monitor.guid),
"slug": str(monitor_environment.monitor.slug),
"name": monitor_environment.monitor.name,
"config": monitor_environment.monitor.config,
"status": monitor_environment.get_status_display(),
"type": monitor_environment.monitor.get_type_display(),
}
Comment on lines +159 to +171

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Bug: schedule_type override in context is never applied

get_monitor_environment_context builds a config copy and conditionally normalizes schedule_type, but then returns the original monitor_environment.monitor.config instead of the modified copy:

config = monitor_environment.monitor.config.copy()
if "schedule_type" in config:
    config["schedule_type"] = monitor_environment.monitor.get_schedule_type_display()

return {
    # ...
    "config": monitor_environment.monitor.config,  # ignores modified `config`
}

As written, the schedule type in the returned context will remain the raw stored value, not the human-readable display value.

You likely meant to return the adjusted config:

 def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
     config = monitor_environment.monitor.config.copy()
     if "schedule_type" in config:
         config["schedule_type"] = monitor_environment.monitor.get_schedule_type_display()
 
     return {
         "id": str(monitor_environment.monitor.guid),
         "slug": str(monitor_environment.monitor.slug),
         "name": monitor_environment.monitor.name,
-        "config": monitor_environment.monitor.config,
+        "config": config,
         "status": monitor_environment.get_status_display(),
         "type": monitor_environment.monitor.get_type_display(),
     }

This keeps the original config immutable and ensures consumers see the normalized schedule type.

🤖 Prompt for AI Agents
In src/sentry/monitors/logic/incident_occurrence.py around lines 159 to 171, the
function modifies a local copy named `config` to normalize `schedule_type` but
then returns the original `monitor_environment.monitor.config`, so the
normalized value is never used; change the returned dictionary to use the
modified `config` variable instead of `monitor_environment.monitor.config` so
the display-friendly `schedule_type` is included while keeping the original
config immutable.

104 changes: 104 additions & 0 deletions src/sentry/monitors/logic/incidents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from __future__ import annotations

import logging
from datetime import datetime
from typing import cast

from sentry.monitors.logic.incident_occurrence import create_incident_occurrence
from sentry.monitors.models import CheckInStatus, MonitorCheckIn, MonitorIncident, MonitorStatus
from sentry.monitors.types import SimpleCheckIn

logger = logging.getLogger(__name__)


def try_incident_threshold(
failed_checkin: MonitorCheckIn,
failure_issue_threshold: int,
received: datetime | None,
) -> bool:
from sentry.signals import monitor_environment_failed

monitor_env = failed_checkin.monitor_environment

if monitor_env is None:
return False

# check to see if we need to update the status
if monitor_env.status in [MonitorStatus.OK, MonitorStatus.ACTIVE]:
if failure_issue_threshold == 1:
previous_checkins: list[SimpleCheckIn] = [
{
"id": failed_checkin.id,
"date_added": failed_checkin.date_added,
"status": failed_checkin.status,
}
]
else:
previous_checkins = cast(
list[SimpleCheckIn],
# Using .values for performance reasons
MonitorCheckIn.objects.filter(
monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
)
.order_by("-date_added")
.values("id", "date_added", "status"),
)

# reverse the list after slicing in order to start with oldest check-in
previous_checkins = list(reversed(previous_checkins[:failure_issue_threshold]))

# If we have any successful check-ins within the threshold of
# commits we have NOT reached an incident state
if any([checkin["status"] == CheckInStatus.OK for checkin in previous_checkins]):
return False

# change monitor status + update fingerprint timestamp
Comment on lines +26 to +55

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

Incident threshold can trigger early when there are fewer failures than failure_issue_threshold

The logic for failure_issue_threshold > 1 slices and reverses recent check-ins but never verifies that you actually have at least failure_issue_threshold check-ins before treating the threshold as reached:

previous_checkins = list(reversed(previous_checkins[:failure_issue_threshold]))
# ...
if any(checkin["status"] == CheckInStatus.OK for checkin in previous_checkins):
    return False
# otherwise we proceed to ERROR + incident creation

If the monitor has fewer than failure_issue_threshold total check-ins, all of which are failures, this will still promote the env to ERROR and create an incident, effectively ignoring the configured threshold.

If the intended semantics are “require N consecutive failing check-ins,” you likely want an explicit length guard, e.g.:

-            previous_checkins = list(reversed(previous_checkins[:failure_issue_threshold]))
-
-            # If we have any successful check-ins within the threshold of
-            # commits we have NOT reached an incident state
-            if any(checkin["status"] == CheckInStatus.OK for checkin in previous_checkins):
-                return False
+            previous_checkins = list(reversed(previous_checkins[:failure_issue_threshold]))
+
+            # If we haven't yet hit the configured number of check-ins, bail out
+            if len(previous_checkins) < failure_issue_threshold:
+                return False
+
+            # If we have any successful check-ins within the threshold window,
+            # we have NOT reached an incident state
+            if any(checkin["status"] == CheckInStatus.OK for checkin in previous_checkins):
+                return False

Please confirm the intended semantics and adjust accordingly.



---

🏁 Script executed:

```shell
#!/bin/bash
# Extract the full try_incident_threshold function to understand complete context
rg -A 100 'def try_incident_threshold' src/sentry/monitors/logic/incidents.py -n

Length of output: 3681


🏁 Script executed:

#!/bin/bash
# Look for tests related to this function
fd -e py -path '*/test*' | xargs rg -l 'try_incident_threshold' 2>/dev/null | head -5

Length of output: 248


🏁 Script executed:

#!/bin/bash
# Search for where try_incident_threshold is called to understand usage
rg -n 'try_incident_threshold' --type py -B2 -A2 | head -50

Length of output: 966


🏁 Script executed:

#!/bin/bash
# Check the MonitorStatus and CheckInStatus enums to understand the status types
rg -A 10 'class MonitorStatus|class CheckInStatus' --type py | head -40

Length of output: 1293


🏁 Script executed:

#!/bin/bash
# Look for any validation or length checks on previous_checkins or similar patterns
rg -n 'len\(previous_checkins\)|failure_issue_threshold' src/sentry/monitors/logic/incidents.py -B2 -A2

Length of output: 678


🏁 Script executed:

#!/bin/bash
# Search for incident creation to understand the downstream impact
rg -n 'incident_issue|starting_checkin' src/sentry/monitors/logic/incidents.py -B3 -A3 | head -60

Length of output: 566


🏁 Script executed:

#!/bin/bash
# Look at the full file structure to understand the broader context
head -100 src/sentry/monitors/logic/incidents.py

Length of output: 3662


🏁 Script executed:

#!/bin/bash
# Check for any related monitor environment update logic
rg -n 'monitor_env.status.*ERROR|MonitorStatus.ERROR' src/sentry/monitors/logic/incidents.py -B2 -A2

Length of output: 369


🏁 Script executed:

#!/bin/bash
# Check database/ORM query to understand what .values returns
rg -n 'SimpleCheckIn' src/sentry/monitors/logic/incidents.py -B3 -A3

Length of output: 973


🏁 Script executed:

#!/bin/bash
# Look for type hints or definitions of SimpleCheckIn
fd -e py | xargs rg -l 'SimpleCheckIn' | head -10

Length of output: 116


🏁 Script executed:

#!/bin/bash
# Check if there are integration tests that exercise the threshold logic
fd -e py -path '*/test*monitor*' | xargs rg -l 'failure_issue_threshold' 2>/dev/null

Length of output: 643


🏁 Script executed:

#!/bin/bash
# Search for guard clauses related to check-in counts
rg -n 'if.*len.*check' src/sentry/monitors/logic/incidents.py -B1 -A1

Length of output: 0


🏁 Script executed:

#!/bin/bash
# Look at the logic flow around the threshold logic more carefully
sed -n '26,80p' src/sentry/monitors/logic/incidents.py

Length of output: 2269



---

🏁 Script executed:

```shell
#!/bin/bash
# Check the test file to understand intended behavior
cat tests/sentry/monitors/logic/test_mark_failed.py | head -150

Length of output: 5417


🏁 Script executed:

#!/bin/bash
# Search for test cases specifically about failure_issue_threshold
rg -n 'failure_issue_threshold' tests/sentry/monitors/logic/test_mark_failed.py -B5 -A10

Length of output: 9843


🏁 Script executed:

#!/bin/bash
# Check mark_failed.py to understand how threshold is determined and used
cat src/sentry/monitors/logic/mark_failed.py

Length of output: 2882


🏁 Script executed:

#!/bin/bash
# Look for any documentation or comments about what the threshold means
rg -n 'failure_issue_threshold|failure.*threshold' src/sentry/monitors/ -B2 -A2 | head -80

Length of output: 3624


Add explicit threshold validation to prevent early incident creation

The code lacks a length check before treating insufficient check-ins as a threshold breach. If a monitor has fewer total failures than failure_issue_threshold (with no intervening OK check-ins), the incident triggers prematurely.

The test passes only because it creates an initial OK check-in that eventually gets pushed outside the slice window. An uncovered edge case: threshold=8 with only 3 actual failures triggers an incident anyway.

Add the length guard:

previous_checkins = list(reversed(previous_checkins[:failure_issue_threshold]))

# Ensure we have accumulated enough failures to meet the threshold
if len(previous_checkins) < failure_issue_threshold:
    return False

# If we have any successful check-ins within the threshold window,
# we have NOT reached an incident state
if any([checkin["status"] == CheckInStatus.OK for checkin in previous_checkins]):
    return False
🤖 Prompt for AI Agents
In src/sentry/monitors/logic/incidents.py around lines 26 to 55, the logic
slices and reverses previous_checkins but does not verify that the number of
returned check-ins actually meets failure_issue_threshold, which can cause
incidents to be created when there are fewer failures than the threshold; modify
the flow so after reversing and slicing previous_checkins you check if
len(previous_checkins) < failure_issue_threshold and return False if so, and
only then proceed to check for any CheckInStatus.OK entries (return False if any
exist) before changing the monitor status.

monitor_env.status = MonitorStatus.ERROR
monitor_env.save(update_fields=("status",))

starting_checkin = previous_checkins[0]

incident: MonitorIncident | None
incident, _ = MonitorIncident.objects.get_or_create(
monitor_environment=monitor_env,
resolving_checkin=None,
defaults={
"monitor": monitor_env.monitor,
"starting_checkin_id": starting_checkin["id"],
"starting_timestamp": starting_checkin["date_added"],
},
)

elif monitor_env.status == MonitorStatus.ERROR:
# if monitor environment has a failed status, use the failed
# check-in and send occurrence
previous_checkins = [
{
"id": failed_checkin.id,
"date_added": failed_checkin.date_added,
"status": failed_checkin.status,
}
]

# get the active incident from the monitor environment
incident = monitor_env.active_incident
else:
# don't send occurrence for other statuses
return False

# Only create an occurrence if:
# - We have an active incident and fingerprint
# - The monitor and env are not muted
if not monitor_env.monitor.is_muted and not monitor_env.is_muted and incident:
checkins = MonitorCheckIn.objects.filter(id__in=[c["id"] for c in previous_checkins])
for checkin in checkins:
create_incident_occurrence(
previous_checkins,
checkin,
incident,
received=received,
)

monitor_environment_failed.send(monitor_environment=monitor_env, sender=type(monitor_env))

return True
Loading