查询训练任务详情
更新时间:2025-05-28
描述
获取一个训练任务的详细信息。
请求结构
Bash
1POST /?action=DescribeJob&resourcePoolId=xxxx
2Host:aihc.bj.baidubce.com
3Authorization:authorization string
4Content-Type: application/json
5X-API-Version: v2
请求头域
除公共头域外,无其它特殊头域。
请求参数
参数名称 | 类型 | 是否必须 | 参数位置 | 说明 |
---|---|---|---|---|
resourcePoolId | String | 是 | Query 参数 | 标识资源池的唯一标识符 |
jobId | String | 是 | Body 参数 | 训练任务ID |
needDetail | Bool | 否 | Body 参数 | 是否需要详细信息,值为true时将返回Pod及历史Pod列表 |
返回头域
除公共头域外,无其它特殊头域。
返回参数
参数名称 | 类型 | 说明 |
---|---|---|
requestId | String | 请求ID |
job | JobItem | 成功请求时的返回结果 |
返回示例
JSON
1{
2 "requestId": "4e0f0e6d-f20b-43c0-b86a-6eac283d03d8",
3 "jobId": "job-e5MZV1Yxw1su",
4 "userId": "eca97e148cb74e9683d7b7240829d1ff",
5 "name": "test-training-job-0513-3",
6 "status": "Running",
7 "createdAt": "2025-05-13T02:28:26Z",
8 "finishedAt": "",
9 "jobType": "pytorch",
10 "resourcePoolId": "cce-cm1jjxrq",
11 "queueId": "default",
12 "jobSpec": {
13 "image": "registry.baidubce.com/aihc-aiak/aiak-megatron:ubuntu20.04-cu11.8-torch1.14.0-py38_v1.2.7.12_release",
14 "imageConfig": {
15 "username": "",
16 "password": ""
17 },
18 "replicas": 2,
19 "resources": [],
20 "envs": [
21 {
22 "name": "AIHC_JOB_NAME",
23 "value": "test-training-job-0513-3"
24 },
25 {
26 "name": "AIHC_TENSORBOARD_LOG_PATH",
27 "value": ""
28 },
29 {
30 "name": "CUDA_DEVICE_MAX_CONNECTIONS",
31 "value": "1"
32 },
33 {
34 "name": "NCCL_DEBUG",
35 "value": "INFO"
36 }
37 ],
38 "enableRDMA": false,
39 "hostNetwork": false
40 },
41 "command": "sleep 1d",
42 "labels": [
43 {
44 "key": "aijob.cce.baidubce.com/ai-user-id",
45 "value": "eca97e148cb74e9683d7b7240829d1ff"
46 },
47 {
48 "key": "aijob.cce.baidubce.com/ai-user-name",
49 "value": "root"
50 },
51 {
52 "key": "aijob.cce.baidubce.com/create-from-aihcp-api",
53 "value": "true"
54 },
55 {
56 "key": "aijob.cce.baidubce.com/openapi-jobid",
57 "value": "job-e5MZV1Yxw1su"
58 }
59 ],
60 "priority": "normal",
61 "dataSources": [],
62 "enableBccl": false,
63 "enableBcclErrorReason": "",
64 "enableFaultTolerant": false,
65 "faultTolerantArgs": "--enable-hang-detection=false --max-num-of-unconditional-retry=0",
66 "pods": [
67 {
68 "PodIP": "10.15.50.82",
69 "nodeName": "192.168.12.157",
70 "creationTimestamp": "2025-05-13T02:28:29Z",
71 "uid": "fcc1e56e-ecd4-4ea7-a5cd-fcc4e5e24509",
72 "name": "test-training-job-0513-3-worker-0",
73 "podPhase": "Running",
74 "status": "Running",
75 "replicaType": "worker",
76 "restartCount": 0,
77 "envs": [
78 {
79 "name": "AIHC_TENSORBOARD_LOG_PATH",
80 "value": ""
81 },
82 {
83 "name": "CUDA_DEVICE_MAX_CONNECTIONS",
84 "value": "1"
85 },
86 {
87 "name": "AIHC_JOB_NAME",
88 "value": "test-training-job-0513-3"
89 },
90 {
91 "name": "NCCL_DEBUG",
92 "value": "INFO"
93 },
94 {
95 "name": "BCCL_BUS_BW_CALCULATE_MODE",
96 "value": "Agg"
97 },
98 {
99 "name": "BCCL_PROFILING_FILE",
100 "value": "/var/bccl/logs/busbw.cal.%h.%p"
101 },
102 {
103 "name": "BCCL_ERROR_FILE",
104 "value": "/var/bccl/logs/err.%h.%p.log"
105 },
106 {
107 "name": "BCCL_TRACE_HANG_ENABLE",
108 "value": "1"
109 },
110 {
111 "name": "BCCL_HANG_DETECT_INTERVAL",
112 "value": "30"
113 },
114 {
115 "name": "BCCL_UNIX_SOCKET_PATH",
116 "value": "/var/bccl/sockets"
117 },
118 {
119 "name": "BCCL_TRACE_SLOW_ENABLE",
120 "value": "1"
121 },
122 {
123 "name": "NODE_NAME",
124 "value": ""
125 },
126 {
127 "name": "JOB_UID",
128 "value": "8b7561fb-93e2-4891-afff-0f109fd74387"
129 },
130 {
131 "name": "JOB_NAME",
132 "value": "test-training-job-0513-3"
133 },
134 {
135 "name": "POD_NAME",
136 "value": ""
137 },
138 {
139 "name": "CLUSTER_ID",
140 "value": "cce-cm1jjxrq"
141 },
142 {
143 "name": "PYTHONUNBUFFERED",
144 "value": "0"
145 },
146 {
147 "name": "MASTER_PORT",
148 "value": "23456"
149 },
150 {
151 "name": "MASTER_ADDR",
152 "value": "test-training-job-0513-3-master-0"
153 },
154 {
155 "name": "WORLD_SIZE",
156 "value": "2"
157 },
158 {
159 "name": "RANK",
160 "value": "1"
161 }
162 ],
163 "finishedAt": "",
164 "reason": ""
165 },
166 {
167 "PodIP": "10.15.50.6",
168 "nodeName": "192.168.12.157",
169 "creationTimestamp": "2025-05-13T02:28:29Z",
170 "uid": "bdb9749e-1e62-45e4-a8b0-0d04cd07c899",
171 "name": "test-training-job-0513-3-master-0",
172 "podPhase": "Running",
173 "status": "Running",
174 "replicaType": "master",
175 "restartCount": 0,
176 "envs": [
177 {
178 "name": "AIHC_JOB_NAME",
179 "value": "test-training-job-0513-3"
180 },
181 {
182 "name": "NCCL_DEBUG",
183 "value": "INFO"
184 },
185 {
186 "name": "AIHC_TENSORBOARD_LOG_PATH",
187 "value": ""
188 },
189 {
190 "name": "CUDA_DEVICE_MAX_CONNECTIONS",
191 "value": "1"
192 },
193 {
194 "name": "BCCL_BUS_BW_CALCULATE_MODE",
195 "value": "Agg"
196 },
197 {
198 "name": "BCCL_PROFILING_FILE",
199 "value": "/var/bccl/logs/busbw.cal.%h.%p"
200 },
201 {
202 "name": "BCCL_ERROR_FILE",
203 "value": "/var/bccl/logs/err.%h.%p.log"
204 },
205 {
206 "name": "BCCL_TRACE_HANG_ENABLE",
207 "value": "1"
208 },
209 {
210 "name": "BCCL_HANG_DETECT_INTERVAL",
211 "value": "30"
212 },
213 {
214 "name": "BCCL_UNIX_SOCKET_PATH",
215 "value": "/var/bccl/sockets"
216 },
217 {
218 "name": "BCCL_TRACE_SLOW_ENABLE",
219 "value": "1"
220 },
221 {
222 "name": "NODE_NAME",
223 "value": ""
224 },
225 {
226 "name": "JOB_UID",
227 "value": "8b7561fb-93e2-4891-afff-0f109fd74387"
228 },
229 {
230 "name": "JOB_NAME",
231 "value": "test-training-job-0513-3"
232 },
233 {
234 "name": "POD_NAME",
235 "value": ""
236 },
237 {
238 "name": "CLUSTER_ID",
239 "value": "cce-cm1jjxrq"
240 },
241 {
242 "name": "PYTHONUNBUFFERED",
243 "value": "0"
244 },
245 {
246 "name": "MASTER_PORT",
247 "value": "23456"
248 },
249 {
250 "name": "MASTER_ADDR",
251 "value": "test-training-job-0513-3-master-0"
252 },
253 {
254 "name": "WORLD_SIZE",
255 "value": "2"
256 },
257 {
258 "name": "RANK",
259 "value": "0"
260 }
261 ],
262 "finishedAt": "",
263 "reason": ""
264 }
265 ]
266}