查询训练任务详情
更新时间:2024-12-27
描述
获取一个训练任务的详细信息。
请求结构
GET /api/v1/aijobs/{jobId}?resourcePoolId={resourcePoolId}
Host:aihc.bj.baidubce.com
Authorization:authorization string
Content-Type: application/json
请求头域
除公共头域外,无其它特殊头域。
请求参数
参数名称 | 类型 | 是否必须 | 参数位置 | 说明 |
---|---|---|---|---|
resourcePoolId | String | 是 | Query 参数 | 标识资源池的唯一标识符 |
jobId | String | 是 | Path 参数 | 训练任务ID |
返回头域
除公共头域外,无其它特殊头域。
返回参数
参数名称 | 类型 | 说明 |
---|---|---|
requestId | String | 请求ID |
result | JobInfoResult | 成功请求时的返回结果 |
返回示例
{
"result": {
"jobId": "pytorchjob-19d38d07-3e04-49ef-8428-d792881fc5fa",
"name": "job-test-3",
"resourcePoolId": "cce-6zwnp4zf",
"command": "python -m torch.distributed.run /workspace/examples/imagenet.py --arch=resnet18 --epochs=100 --batch-size=32 --workers=0 /workspace/data/tiny-imagenet-200",
"createdAt": "2024-07-16T17:20:04Z",
"finishedAt": "",
"datasources": [],
"enableFaultTolerance": true,
"labels": [
{
"key": "aaaaa",
"value": "bbbb"
},
{
"key": "aijob.cce.baidubce.com/create-from-aihcp-api",
"value": "true"
},
{
"key": "aijob.cce.baidubce.com/openapi-jobid",
"value": "pytorchjob-19d38d07-3e04-49ef-8428-d792881fc5fa"
}
],
"priority": "normal",
"queue": "default",
"status": "Running",
"image": "registry.baidubce.com/cce-ai-native/cy-pytorch-mnist:etcd",
"resources": [
{
"name": "cpu",
"quantity": 1
}
],
"enableRDMA": false,
"queueingSequence": 1,
"podList": {
"listMeta": {
"totalItems": 1
},
"pods": [
{
"PodIP": "10.11.3.106",
"nodeName": "192.168.12.46",
"objectMeta": {
"annotations": {
"aijob.cce.baidubce.com/fault-tolerance-enabled": "true",
"aijob.cce.baidubce.com/openapi-jobid": "pytorchjob-19d38d07-3e04-49ef-8428-d792881fc5fa",
"aijob.cce.baidubce.com/raw-request": "{\"name\":\"job-test-3\",\"namespace\":\"default\",\"queue\":\"default\",\"priority\":\"normal\",\"oversell\":false,\"faultTolerance\":true,\"command\":\"python -m torch.distributed.run /workspace/examples/imagenet.py --arch=resnet18 --epochs=100 --batch-size=32 --workers=0 /workspace/data/tiny-imagenet-200\",\"datasources\":[],\"jobFramework\":\"PyTorchJob\",\"jobDistributed\":false,\"jobSpec\":{\"Master\":{\"replicas\":1,\"restartPolicy\":\"Never\",\"image\":\"registry.baidubce.com/cce-ai-native/cy-pytorch-mnist:etcd\",\"tag\":\"\",\"resource\":{\"cpu\":1},\"env\":{\"AIHC_JOB_NAME\":\"job-test-3\",\"AIHC_TENSORBOARD_LOG_PATH\":\"\",\"LOGLEVEL\":\"DEBUG\",\"NCCL_DEBUG\":\"INFO\"},\"command\":\"python -m torch.distributed.run /workspace/examples/imagenet.py --arch=resnet18 --epochs=100 --batch-size=32 --workers=0 /workspace/data/tiny-imagenet-200\",\"args\":\"\",\"postStart\":\"\",\"preStop\":\"\"}},\"imagePullSecrets\":null,\"imagePullSecretUsername\":\"\",\"imagePullSecretPassword\":\"\",\"labels\":{\"aaaaa\":\"bbbb\",\"aijob.cce.baidubce.com/create-from-aihcp-api\":\"true\"},\"annotations\":null,\"nodeSelector\":null,\"autoCreatePVC\":true,\"hostNetwork\":false,\"isCopyJob\":false,\"sourceJobName\":\"\",\"workloadType\":\"PyTorchJob\",\"pfsId\":\"\"}",
"cce-workload-kind": "PyTorchJob",
"cce-workload-name": "job-test-3",
"prometheus.io/path": "/metrics",
"prometheus.io/port": "9101",
"prometheus.io/scrape": "true",
"scheduling.k8s.io/group-name": "job-test-3",
"scheduling.k8s.io/job-enable-oversell": "false",
"volcano.sh/task-spec": "master"
},
"creationTimestamp": "2024-07-16T17:20:04Z",
"labels": {
"aaaaa": "bbbb",
"aijob.cce.baidubce.com/create-from-aihcp-api": "true",
"aijob.cce.baidubce.com/openapi-jobid": "pytorchjob-19d38d07-3e04-49ef-8428-d792881fc5fa",
"training.kubeflow.org/job-name": "job-test-3",
"training.kubeflow.org/job-role": "master",
"training.kubeflow.org/operator-name": "pytorchjob-controller",
"training.kubeflow.org/replica-index": "0",
"training.kubeflow.org/replica-type": "master"
},
"name": "job-test-3-master-0",
"namespace": "default",
"ownerReferences": [
{
"apiVersion": "kubeflow.org/v1",
"kind": "PyTorchJob",
"name": "job-test-3",
"uid": "b3212c83-ca27-4989-a346-bed704eba7eb",
"controller": true,
"blockOwnerDeletion": true
}
]
},
"podStatus": {
"podPhase": "Running",
"status": "Running"
},
"replicaType": "master",
"restartCount": 0,
"envs": [
{
"name": "LOGLEVEL",
"value": "DEBUG"
},
{
"name": "AIHC_JOB_NAME",
"value": "job-test-3"
},
{
"name": "NCCL_DEBUG",
"value": "INFO"
},
{
"name": "AIHC_TENSORBOARD_LOG_PATH",
"value": ""
},
{
"name": "BCCL_BUS_BW_CALCULATE_MODE",
"value": "Agg"
},
{
"name": "BCCL_PROFILING_FILE",
"value": "/var/logs/bccl/busbw.cal.%h.%p"
},
{
"name": "BCCL_UNIX_SOCKET_PATH",
"value": "/var/logs/bccl"
},
{
"name": "BCCL_TRACE_HANG_SIGNAL",
"value": "10"
},
{
"name": "PYTHONUNBUFFERED",
"value": "0"
},
{
"name": "MASTER_PORT",
"value": "23456"
},
{
"name": "MASTER_ADDR",
"value": "job-test-3-master-0"
},
{
"name": "WORLD_SIZE",
"value": "1"
},
{
"name": "RANK",
"value": "0"
},
{
"name": "NVIDIA_VISIBLE_DEVICES",
"value": "void"
}
]
}
]
}
},
"requestId": "4a516705-9c97-4e32-9473-c783ec85bec4"
}