diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..1fd2c2f --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,38 @@ +name: CI - Build and Push QuickTicket Images + +on: + push: + branches: [ main ] + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + packages: write + contents: read # least privilege: the job only checks out code, it never pushes commits + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push gateway + run: | + docker build -t ghcr.io/${{ github.repository_owner }}/quickticket-gateway:${{ github.sha }} ./app/gateway + docker push ghcr.io/${{ github.repository_owner }}/quickticket-gateway:${{ github.sha }} + + - name: Build and push events + run: | + docker build -t ghcr.io/${{ github.repository_owner }}/quickticket-events:${{ github.sha }} ./app/events + docker push ghcr.io/${{ github.repository_owner }}/quickticket-events:${{ github.sha }} + + - name: Build and push payments + run: | + docker build -t ghcr.io/${{ github.repository_owner }}/quickticket-payments:${{ github.sha }} ./app/payments + docker push ghcr.io/${{ github.repository_owner }}/quickticket-payments:${{ github.sha }} diff --git a/app/events/.dockerignore b/app/events/.dockerignore new file mode 100644 index 0000000..ce2bb52 --- /dev/null +++ b/app/events/.dockerignore @@ -0,0 +1,10 @@ +__pycache__ +*.pyc +*.pyo +.git +.gitignore +.env +README.md +*.md +.vscode +__MACOSX diff --git a/app/gateway/.dockerignore b/app/gateway/.dockerignore new file mode 100644 index 0000000..ce2bb52 --- /dev/null +++ b/app/gateway/.dockerignore @@ -0,0 +1,10 @@ +__pycache__ +*.pyc +*.pyo +.git +.gitignore +.env +README.md +*.md +.vscode +__MACOSX diff --git a/app/gateway/Dockerfile b/app/gateway/Dockerfile index 68ef075..ffcaed8 100644 --- a/app/gateway/Dockerfile +++ b/app/gateway/Dockerfile @@ -3,7 +3,9 @@ FROM python:3.13-slim WORKDIR /app COPY 
requirements.txt . RUN pip install --no-cache-dir -r requirements.txt +RUN addgroup --system app && adduser --system --ingroup app app COPY main.py . - +RUN chown -R app:app /app +USER app EXPOSE 8080 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/app/gateway/main.py b/app/gateway/main.py index c86db33..ef164ff 100644 --- a/app/gateway/main.py +++ b/app/gateway/main.py @@ -310,14 +310,9 @@ async def _notify_order_confirmed(reservation_id: str): log.warning(f"notify failed (non-critical) order={reservation_id} err={e}") @app.post("/reserve/{reservation_id}/pay") async def pay_reservation(reservation_id: str): - # 1. Call payments — wrapped in circuit breaker + retry. - # - # Composition order matters: cb.call(retry(_charge)) means each CB-tracked - # invocation includes its retries internally; the CB only sees the FINAL - # outcome. The reverse — retry(cb.call(_charge)) — would retry past the - # CircuitOpenError, defeating the fast-fail. See lab 11 §11.4. 
+ """Charge and confirm a reservation; return a 503 JSON error (payments_unavailable) when payments is down or the circuit is open.""" async def _charge(): resp = await client.post( f"{PAYMENTS_URL}/charge", @@ -327,20 +322,27 @@ async def _charge(): return resp try: + # Breaker wraps retry: the CB sees only the final outcome of the retried call, so an open circuit fails fast instead of being retried pay_resp = await payments_cb.call(lambda: call_with_retry(_charge, target="payments")) payment_ref = pay_resp.json().get("payment_ref", "unknown") - except CircuitOpenError: - log.error("circuit open, skipping payments call") - raise HTTPException(503, "Payment service temporarily unavailable (circuit open)") - except httpx.TimeoutException: - raise HTTPException(504, "Payment service timeout") + except (CircuitOpenError, httpx.RequestError) as e: + # Graceful degradation: httpx.RequestError already covers connect errors and timeouts + log.warning(f"Payments service unavailable for reservation {reservation_id}: {e}") + return JSONResponse( + status_code=503, + content={ + "error": "payments_unavailable", + "message": "Payment service is temporarily down. Your reservation is held — try again in a few minutes.", + "reservation_id": reservation_id + } + ) except httpx.HTTPStatusError as e: raise HTTPException(e.response.status_code, "Payment failed") except Exception as e: log.error(f"payment error: {e}") raise HTTPException(502, "Payment service unavailable") - # 2. Confirm reservation in events. + # 2. Confirm reservation in events (only if payment succeeded) try: confirm_resp = await client.post( f"{EVENTS_URL}/reservations/{reservation_id}/confirm", @@ -352,7 +354,7 @@ async def _charge(): log.error(f"confirm error after payment: {e}") raise HTTPException(500, "Payment succeeded but confirmation failed — contact support") - # 3. Fire-and-forget notify (don't await → don't add latency, don't fail user). + # 3. 
Fire-and-forget notify (not awaited, so it adds no latency and cannot fail the user's request) asyncio.create_task(_notify_order_confirmed(reservation_id)) return result diff --git a/app/payments/.dockerignore b/app/payments/.dockerignore new file mode 100644 index 0000000..ce2bb52 --- /dev/null +++ b/app/payments/.dockerignore @@ -0,0 +1,10 @@ +__pycache__ +*.pyc +*.pyo +.git +.gitignore +.env +README.md +*.md +.vscode +__MACOSX diff --git a/docker-compose.monitoring.yaml b/docker-compose.monitoring.yaml index 06e19bd..2f70089 100644 --- a/docker-compose.monitoring.yaml +++ b/docker-compose.monitoring.yaml @@ -5,10 +5,10 @@ services: - "9090:9090" volumes: - ../monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ../monitoring/prometheus/rules.yml:/etc/prometheus/rules.yml:ro # recording rules loaded via rule_files in prometheus.yml command: - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.retention.time=7d" - grafana: image: grafana/grafana:13.0.1 ports: diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 0000000..3a5352b --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,23 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: + - "rules.yml" + +scrape_configs: + - job_name: 'gateway' + static_configs: + - targets: ['gateway:8080'] + + - job_name: 'events' + static_configs: + - targets: ['events:8081'] + + - job_name: 'payments' + static_configs: + - targets: ['payments:8082'] + + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/monitoring/prometheus/rules.yml b/monitoring/prometheus/rules.yml new file mode 100644 index 0000000..9c98f7a --- /dev/null +++ b/monitoring/prometheus/rules.yml @@ -0,0 +1,12 @@ +groups: + - name: quickticket_slo_rules + interval: 30s + rules: + - record: gateway:sli_availability:ratio_rate5m + expr: sum(rate(gateway_requests_total{status!~"5.."}[5m])) / sum(rate(gateway_requests_total[5m])) + + - record: gateway:sli_latency_500ms:ratio_rate5m + expr: 
sum(rate(gateway_request_duration_seconds_bucket{le="0.5"}[5m])) / sum(rate(gateway_request_duration_seconds_count[5m])) + + - record: gateway:error_budget_burn_rate:ratio_rate5m + expr: (1 - gateway:sli_availability:ratio_rate5m) / (1 - 0.995) diff --git a/submissions/lab1.md b/submissions/lab1.md new file mode 100644 index 0000000..3af1583 --- /dev/null +++ b/submissions/lab1.md @@ -0,0 +1,179 @@ +# Lab 1 — SRE Philosophy: Deploy, Break, Understand + +## Docker Compose Status + +All 5 services are running successfully: + +```bash +NAME IMAGE STATUS PORTS +app-events-1 app-events Up 0.0.0.0:8081->8081/tcp +app-gateway-1 app-gateway Up 0.0.0.0:3080->8080/tcp +app-payments-1 app-payments Up 0.0.0.0:8082->8082/tcp +app-postgres-1 postgres:17-alpine Up (healthy) 0.0.0.0:5432->5432/tcp +app-redis-1 redis:7-alpine Up (healthy) 0.0.0.0:6379->6379/tcp +``` + +## Critical Path (Everything Working) + +### 1. List Events + +```json +[ + { + "id": 1, + "name": "Go Conference 2026", + "venue": "Main Hall A", + "date": "2026-09-15T09:00:00+00:00", + "total_tickets": 100, + "price_cents": 5000, + "available": 99 + }, + { + "id": 4, + "name": "Python Workshop", + "venue": "Lab 301", + "date": "2026-09-22T14:00:00+00:00", + "total_tickets": 25, + "price_cents": 2000, + "available": 25 + }, + { + "id": 2, + "name": "SRE Meetup", + "venue": "Room 204", + "date": "2026-10-01T18:00:00+00:00", + "total_tickets": 30, + "price_cents": 0, + "available": 30 + }, + { + "id": 5, + "name": "Kubernetes Deep Dive", + "venue": "Auditorium B", + "date": "2026-10-10T10:00:00+00:00", + "total_tickets": 80, + "price_cents": 8000, + "available": 80 + }, + { + "id": 3, + "name": "Cloud Native Summit", + "venue": "Expo Center", + "date": "2026-11-20T10:00:00+00:00", + "total_tickets": 500, + "price_cents": 15000, + "available": 500 + } +] +``` + +### 2. 
Reserve a Ticket + +```json +{ + "reservation_id": "a3370485-51ea-46bf-a3b1-c6cf7a101df4", + "event_id": 1, + "quantity": 1, + "total_cents": 5000, + "expires_in_seconds": 300 +} +``` + +### 3. Pay for Reservation + +```json +{ + "order_id": "a3370485-51ea-46bf-a3b1-c6cf7a101df4", + "event_id": 1, + "quantity": 1, + "total_cents": 5000, + "status": "confirmed" +} +``` + +### 4. Health Check + +```json +{ + "status": "healthy", + "checks": { + "events": "ok", + "payments": "ok", + "circuit_payments": "CLOSED" + } +} +``` + +## Dependency Map + +```mermaid +graph TD + Gateway --> Events + Gateway --> Payments + Events --> Postgres + Events --> Redis +``` + +## Failure Table + +| Component Killed | Events List | Reserve | Pay | Health Check | User Impact | +| ---------------- | ----------- | ------- | ----- | ------------ | -------------------------------- | +| payments | Works | Works | Fails | degraded | Can reserve but cannot pay | +| events | Fails | Fails | Fails | degraded | Cannot browse or buy tickets | +| redis | Works | Works | Works | ok | Minor impact | +| postgres | Fails | Fails | Fails | degraded | Events service completely broken | + +## Load Generator Test + +I ran the load generator: + +```bash +../loadgen/run.sh 5 30 +``` + +While it was running, I stopped the payments service. The error rate increased significantly, but list and reserve endpoints continued working. This demonstrates the blast radius of the payments service and validates graceful degradation behavior. + +## Task 2 — Graceful Degradation + +Modified `gateway/main.py` to return a clear 503 response when payments are unavailable. + +Example response: + +```json +{ + "error": "payments_unavailable", + "message": "Payment service is temporarily down. Your reservation is held — try again in a few minutes.", + "reservation_id": "..." +} +``` + +Results: + +* Reserve endpoint continued working. +* Pay endpoint returned a friendly error message. 
+* User experience degraded gracefully instead of failing unexpectedly. + +## Bonus Task — Resource Usage + +### Idle + +```bash +NAME CPU % MEM USAGE +app-gateway-1 0.25% 38.11MiB +app-events-1 0.25% 41MiB +app-payments-1 0.23% 32.96MiB +app-postgres-1 2.59% 23.89MiB +app-redis-1 0.86% 3.66MiB +``` + +### Observations + +* PostgreSQL consumed the highest CPU while idle. +* Redis used the least memory. +* Gateway and Events services increased CPU usage under load because they handled incoming traffic. +* When Payments was unavailable, Gateway retained requests longer and showed increased resource utilization. + +## GitHub Community +I starred the course repository and the `simple-container-com/api` project. +I followed the professor (@Cre-eD), TAs (@Naghme98, @pierrepicaud), and several classmates. +Starring repositories supports maintainers and helps useful projects gain visibility. Following developers helps me learn from their work and expand my professional network. diff --git a/submissions/lab2.md b/submissions/lab2.md new file mode 100644 index 0000000..f80e518 --- /dev/null +++ b/submissions/lab2.md @@ -0,0 +1,111 @@ +# Lab 2 Containerization: Inspect, Understand, Optimize + +## Task 1 Docker Inspection and Operations + +### 1.1 Image inspection +```bash +docker images | grep app +``` + +I check app images here. + +- app-events:latest — about 233MB +- app-gateway:latest — about 213MB +- app-payments:latest — about 211MB + +Biggest part is Python install and pip packages. 
+ +### 1.2 Container inspection + +I checked the IP addresses of the services: + +- gateway: 172.21.0.6 +- events: 172.21.0.5 +- payments: 172.21.0.4 + +Payments env variables: + +- PAYMENT_FAILURE_RATE=0.0 +- PAYMENT_LATENCY_MS=0 + +### 1.3 Live debugging with exec + +```bash +docker exec app-gateway-1 whoami +# root (before Task 2) +``` + +DNS resolver is: + +- nameserver 127.0.0.11 + +Check connection: + +- http://events:8081/health -> works +- http://payments:8082/health -> works + +So services talk to each other by names like events and payments. Docker DNS helps here. + +### 1.4 Logs analysis + +Logs show the request flow: + +- Gateway -> Events (reserve) +- Gateway -> Payments (charge) +- Gateway -> Events (confirm) + +### 1.5 Network inspection + +All containers are in network `app_default`. + +IP range is like `172.21.0.0/16`. + +--- + +## Task 2 — Dockerfile Optimization + +I made some small optimizations: + +- make `.dockerignore` in `gateway/`, `events/`, `payments/` +- update `gateway/Dockerfile` +- add non-root user `app` + +Check: + +```bash +docker exec app-gateway-1 whoami +# app +``` + +So the gateway now runs as a non-root user. + +--- + +## Bonus Task — Trace a Request Across Services + +I traced one ticket purchase request. + +Reservation ID: `cbb0db56-1b8b-4b10-a0f2-25b5e3378f3e` + +Log flow: + +- Gateway receives `POST /events/1/reserve` -> `200 OK` +- Events reserves the ticket +- Gateway -> Payments `/charge` -> `200 OK` +- Gateway -> Events `/confirm` -> `200 OK` +- User gets confirmation + +End-to-end time is about 100-200 ms. It is fast. 
+ +--- + +## Conclusions + +In Lab 2 I learned: + +- Docker image layers +- service discovery by name +- how to debug with `docker exec` and `logs` +- basic optimization and security with non-root user + +I am ready for the next labs diff --git a/submissions/lab3.md b/submissions/lab3.md new file mode 100644 index 0000000..47519ff --- /dev/null +++ b/submissions/lab3.md @@ -0,0 +1,44 @@ +# Lab 3 Monitoring, Observability & SLOs + +## Task 1 Monitoring Setup + +**Prometheus Configuration** (`monitoring/prometheus/prometheus.yml`) +I set scrape targets for gateway, events and payments. + +**Monitoring Stack** is running now, 7 services. + +**Prometheus Targets** are all **up**. + +**Golden Signals Dashboard** in Grafana: + +- I added a **Latency** panel (p50, p95, p99) +- I added a **Saturation** panel (DB pool gauge) + +When I stop payments, I can see a big increase in Error Rate, and Service Health goes down. + +## Task 2 SLOs and Recording Rules + +I created `monitoring/prometheus/rules.yml` with three recording rules: + +- `gateway:sli_availability:ratio_rate5m` +- `gateway:sli_latency_500ms:ratio_rate5m` +- `gateway:error_budget_burn_rate:ratio_rate5m` + +Rules are loaded in Prometheus successfully. + +**SLI/SLO:** + +- Availability SLO: **99.5%** +- Latency SLO (< 500ms): **95%** + +## Bonus Task Failure Correlation + +I ran load, injected a failure in payments and watched the dashboard + logs. + +**Conclusion:** The failure first shows in **Error Rate**, then in **Service Health**. Latency increases later. + +## Final + +In Lab 3 I set up monitoring for QuickTicket with Prometheus + Grafana, made a Golden Signals dashboard and defined basic SLOs. + + diff --git a/submissions/lab5.md b/submissions/lab5.md new file mode 100644 index 0000000..5720336 --- /dev/null +++ b/submissions/lab5.md @@ -0,0 +1,56 @@ +# Lab 5 — CI/CD & GitOps + +## Task 1 CI Pipeline + ArgoCD + +I created a GitHub Actions CI workflow (`.github/workflows/ci.yml`). 
+ +The workflow finished successfully: + +* build Docker images +* push images to ghcr.io + +I installed ArgoCD and created the Application `quickticket`. + +I tested the GitOps workflow. + +When I push changes to the Git repository, ArgoCD automatically deploys the new version. + +## Task 2 Rollback via GitOps + +### 1. Deploy bad version + +I changed the image tag in `k8s/gateway.yaml` to a wrong tag. + +After git push, ArgoCD tried to sync the application. + +The gateway pod went into the `ImagePullBackOff` state. + +### 2. Rollback + +```bash +git revert HEAD --no-edit +git push origin main +``` + +ArgoCD automatically rolled back the changes. + +The application returned to Healthy status. + +Recovery time was about 1 to 2 minutes after git push. + +## Bonus Task + +I did not do the bonus task because I had some problems with the ArgoCD path configuration. + +But I understand the idea of automatic image tag updates. + +## Final + +In this lab I: + +* set up a CI/CD pipeline with GitHub Actions +* installed ArgoCD +* used the GitOps workflow +* tested rollback with git revert + +This lab helped me understand how modern deployment and rollback work in DevOps and SRE.