-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
134 lines (128 loc) · 3.83 KB
/
docker-compose.yml
File metadata and controls
134 lines (128 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
services:
# DGPU scheduler: primary (master) instance.
  scheduler-master:
    container_name: dgpu-scheduler-master
    build:
      context: .
      dockerfile: deployments/docker/Dockerfile.scheduler
    restart: unless-stopped
    environment:
      SCHEDULER_ROLE: "master"
      LOG_LEVEL: "info"
    ports:
      - "8080:8080"  # REST API
      - "9090:9090"  # gRPC API
    networks:
      - dgpu-network
    volumes:
      # Named volume mounted at the scheduler's state directory.
      - scheduler_state:/var/lib/dgpu-scheduler/state
      # Scheduler configuration from the repository, mounted read-only.
      - ./configs/scheduler.yaml:/home/scheduler/configs/scheduler.yaml:ro
    healthcheck:
      # Probe the /health endpoint on the REST port; --spider checks that the
      # URL is reachable without saving a response body.
      test:
        - "CMD"
        - "wget"
        - "--no-verbose"
        - "--tries=1"
        - "--spider"
        - "http://localhost:8080/health"
      interval: 30s
      timeout: 10s
      retries: 3
      # Failures during the first 40s do not count towards `retries`.
      start_period: 40s
# DGPU scheduler: standby instance (optional, only started with `--profile ha`).
  scheduler-standby:
    build:
      context: .
      dockerfile: deployments/docker/Dockerfile.scheduler
    container_name: dgpu-scheduler-standby
    ports:
      - "8081:8080"  # REST API (different host port than the master)
      - "9091:9090"  # gRPC API (different host port than the master)
    environment:
      - SCHEDULER_ROLE=standby
      # The master is addressed by its service name on the shared network.
      - MASTER_ADDRESS=scheduler-master:9090
      - LOG_LEVEL=info
    volumes:
      # Separate named volume, so the standby does not share the master's state.
      - standby_state:/var/lib/dgpu-scheduler/state
      # Same scheduler configuration file as the master, mounted read-only.
      - ./configs/scheduler.yaml:/home/scheduler/configs/scheduler.yaml:ro
    restart: unless-stopped
    networks:
      - dgpu-network
    depends_on:
      # Long form: wait until the master's healthcheck passes. The short list
      # form only waits for the master container to be started, not for its
      # API to be reachable.
      scheduler-master:
        condition: service_healthy
    profiles:
      - ha  # enable high availability with `--profile ha`
# DGPU agent 1 (simulated GPU node).
  agent-node-1:
    build:
      context: .
      dockerfile: deployments/docker/Dockerfile.agent
    container_name: dgpu-agent-1
    environment:
      - SCHEDULER_MASTER_ADDRESS=scheduler-master:9090
      # NOTE(review): scheduler-standby only runs under the `ha` profile, so
      # this address does not resolve in a basic startup; confirm the agent
      # tolerates an unreachable standby.
      - SCHEDULER_STANDBY_ADDRESS=scheduler-standby:9090
      - AGENT_ID=agent-1
      - GPU_DETECTION_METHOD=nvidia-smi
      - LOG_LEVEL=info
      # /app/test-local is appended last, so executables from the mounted
      # ./test-local directory are only used when no earlier PATH entry
      # provides the same command.
      - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/app/test-local
    volumes:
      # Docker access: exposes the host's Docker daemon socket to the agent,
      # which gives the container control over the host's Docker engine.
      - /var/run/docker.sock:/var/run/docker.sock
      - agent_1_logs:/var/lib/dgpu-agent/logs
      - ./configs/agent.yaml:/home/agent/configs/agent.yaml:ro
      # Simulated-GPU scripts, mounted read-only.
      - ./test-local:/app/test-local:ro
    restart: unless-stopped
    networks:
      - dgpu-network
    depends_on:
      # Long form: wait until the master's healthcheck passes. The short list
      # form only waits for the master container to be started, not for its
      # API to be reachable.
      scheduler-master:
        condition: service_healthy
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            # NOTE(review): this presumably needs the NVIDIA container
            # runtime on the host even when the simulated scripts are used;
            # confirm for GPU-less machines.
            - driver: nvidia
              count: all
              capabilities: [gpu]
# Additional agent nodes can be added following this pattern.
  agent-node-2:
    build:
      context: .
      dockerfile: deployments/docker/Dockerfile.agent
    container_name: dgpu-agent-2
    environment:
      - SCHEDULER_MASTER_ADDRESS=scheduler-master:9090
      # NOTE(review): scheduler-standby only runs under the `ha` profile, so
      # this address does not resolve unless `--profile ha` is also enabled;
      # confirm the agent tolerates an unreachable standby.
      - SCHEDULER_STANDBY_ADDRESS=scheduler-standby:9090
      - AGENT_ID=agent-2
      - GPU_DETECTION_METHOD=nvidia-smi
      - LOG_LEVEL=info
      # /app/test-local is appended last, so executables from the mounted
      # ./test-local directory are only used when no earlier PATH entry
      # provides the same command.
      - PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/app/test-local
    volumes:
      # Exposes the host's Docker daemon socket to the agent, which gives the
      # container control over the host's Docker engine.
      - /var/run/docker.sock:/var/run/docker.sock
      - agent_2_logs:/var/lib/dgpu-agent/logs
      - ./configs/agent.yaml:/home/agent/configs/agent.yaml:ro
      # Simulated-GPU scripts, mounted read-only.
      - ./test-local:/app/test-local:ro
    restart: unless-stopped
    networks:
      - dgpu-network
    depends_on:
      # Long form: wait until the master's healthcheck passes. The short list
      # form only waits for the master container to be started, not for its
      # API to be reachable.
      scheduler-master:
        condition: service_healthy
    profiles:
      - scale  # enable multiple agent nodes with `--profile scale`
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            # NOTE(review): this presumably needs the NVIDIA container
            # runtime on the host even when the simulated scripts are used;
            # confirm for GPU-less machines.
            - driver: nvidia
              count: all
              capabilities: [gpu]
networks:
  # Single network that every service in this file attaches to.
  dgpu-network:
    # Fixed network name, used instead of a Compose-generated one.
    name: dgpu-scheduler-network
    driver: bridge
volumes:
  # Scheduler state directories: one volume per scheduler instance.
  scheduler_state:
    driver: local
  standby_state:
    driver: local

  # Agent log directories: one volume per agent node.
  agent_1_logs:
    driver: local
  agent_2_logs:
    driver: local
# Usage:
#   Basic startup:           docker-compose up -d
#   High-availability mode:  docker-compose --profile ha up -d
#   Multi-node mode:         docker-compose --profile scale up -d
#   Full mode (HA + scale):  docker-compose --profile ha --profile scale up -d