分布式训练相关
更新时间:2025-11-26
查询训练任务列表
使用以下代码可以查询训练任务列表。
Python
# Sample script: list training jobs in a resource pool through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables (optional, recommended)
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and service endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'  # AIHC service endpoint
AK = os.getenv('AK') or 'your-access-key-id'  # your Access Key ID
SK = os.getenv('SK') or 'your-secret-access-key'  # your Secret Access Key

# Build the BCE client configuration
config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)

# Create the AIHC client
aihc_client = AihcClient(config)

# List training jobs
try:
    __logger.info('--------------------------------DescribeJobs start--------------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    keyword = ""
    response = aihc_client.job.DescribeJobs(resourcePoolId=resource_pool_id, keyword=keyword)
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为查询训练任务列表
查询训练任务详情
使用以下代码可以查询训练任务详情。
Python
# Sample script: fetch the details of a single training job through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Query training job details
try:
    __logger.info('--------------------------------DescribeJob start--------------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    queue_id = "default"  # replace with the actual queue ID or name
    job_id = "job-xxx"  # replace with the actual job ID
    need_detail = True
    response = aihc_client.job.DescribeJob(
        resourcePoolId=resource_pool_id,
        queueID=queue_id,
        jobId=job_id,
        needDetail=need_detail,
    )
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为查询训练任务详情
删除训练任务
使用以下代码可以删除训练任务。
Python
# Sample script: delete a training job through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Delete the training job
try:
    __logger.info('----------------------------------DeleteJob start-----------------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    job_id = "job-xxx"  # replace with the actual job ID
    response = aihc_client.job.DeleteJob(resourcePoolId=resource_pool_id, jobId=job_id)
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为删除训练任务
更新训练任务
使用以下代码可以更新训练任务(如修改优先级)。
Python
# Sample script: update a training job (here, its priority) through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Update the training job
try:
    __logger.info('---------------------------------ModifyJob start----------------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    job_id = "job-xxx"  # replace with the actual job ID
    priority = "high"  # new priority
    response = aihc_client.job.ModifyJob(resourcePoolId=resource_pool_id, jobId=job_id, priority=priority)
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为更新训练任务
查询训练任务事件
使用以下代码可以查询训练任务事件。
Python
# Sample script: query the events of a training job through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Query training job events
try:
    __logger.info('--------------------------DescribeJobEvents start-----------------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    job_id = "job-xxx"  # replace with the actual job ID
    # NOTE(review): values look like Unix timestamps in seconds -- confirm against the API doc
    start_time = "1758532230"  # start timestamp
    end_time = "1758618650"  # end timestamp
    response = aihc_client.job.DescribeJobEvents(
        resourcePoolId=resource_pool_id,
        jobId=job_id,
        startTime=start_time,
        endTime=end_time,
    )
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为查询训练任务事件
查询训练任务日志
使用以下代码可以查询训练任务日志。
Python
# Sample script: query the logs of one pod of a training job through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Query training job logs
try:
    __logger.info('--------------------------DescribeJobLogs start---------------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    job_id = "job-xxx"  # replace with the actual job ID
    pod_name = "xxx-test-copy2-master-0"  # replace with the actual pod name
    keywords = "xxx"  # search keyword
    response = aihc_client.job.DescribeJobLogs(
        resourcePoolId=resource_pool_id,
        jobId=job_id,
        keywords=keywords,
        podName=pod_name,
    )
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为查询训练任务日志
查询训练任务Pod事件
使用以下代码可以查询训练任务Pod事件。
Python
# Sample script: query the events of one pod of a training job through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Query training job pod events
try:
    __logger.info('---------------DescribeJobPodEvents start---------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    pod_name = "job-xxx-master-0"  # replace with the actual pod name
    job_id = "job-xxx"  # replace with the actual job ID
    response = aihc_client.job.DescribeJobPodEvents(
        resourcePoolId=resource_pool_id,
        jobId=job_id,
        podName=pod_name,
    )
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为查询训练任务Pod事件
停止训练任务
使用以下代码可以停止训练任务。
Python
# Sample script: stop a training job through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Stop the training job
try:
    __logger.info('stop job')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    job_id = "job-xxx"  # replace with the actual job ID
    response = aihc_client.job.StopJob(resourcePoolId=resource_pool_id, jobId=job_id)
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为停止训练任务
查询训练任务所在节点列表
使用以下代码可以查询训练任务所在节点列表。
Python
# Sample script: list the nodes a training job is running on through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Query the nodes hosting the training job
try:
    __logger.info('-------------------DescribeJobNodes start--------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    job_id = "job-xxx"  # replace with the actual job ID
    response = aihc_client.job.DescribeJobNodes(resourcePoolId=resource_pool_id, jobId=job_id)
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为查询训练任务所在节点列表
获取训练任务WebTerminal地址
使用以下代码可以获取训练任务WebTerminal地址。
Python
# Sample script: get the WebTerminal address of a training job pod through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Get the training job WebTerminal address
try:
    __logger.info('-------------------DescribeJobWebterminal start--------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    job_id = "job-xxx"  # replace with the actual job ID
    pod_name = "xxx-test-bb1-rerun1-master-0"  # replace with the actual pod name
    # NOTE(review): timeouts are passed as strings; the names suggest seconds -- confirm against the API doc
    handshake_timeout_second = "30"
    ping_timeout_second = "900"
    response = aihc_client.job.DescribeJobWebterminal(
        resourcePoolId=resource_pool_id,
        jobId=job_id,
        podName=pod_name,
        pingTimeoutSecond=ping_timeout_second,
        handshakeTimeoutSecond=handshake_timeout_second,
    )
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为获取训练任务WebTerminal地址
创建训练任务
使用以下代码可以创建训练任务。
Python
# Sample script: create a training job through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.services.aihc.modules.job.job_model import Datasource, Env, JobSpec
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Create the training job
try:
    __logger.info('---------------------------------CreateJob start------------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    queue_id = "default"  # replace with the actual queue ID or name
    name = "python-sdk-test-xxx"  # job name
    command = "sleep 5m"  # start command

    # Environment variables injected into the job.
    # NCCL documents VERSION/WARN/INFO/TRACE for NCCL_DEBUG; "DEBUG" is not a
    # documented level, so INFO is used here.
    envs = [Env(name="NCCL_DEBUG", value="INFO"), Env(name="NCCL_IB_DISABLE", value="0")]

    # Data source to mount (replace the name with your own storage instance)
    data_source = Datasource(type="pfs", name="pfs-pxE6jz", mountPath="/mnt/cluster")

    # Job spec
    job_spec = JobSpec(
        image="registry.baidubce.com/aihc-aiak/aiak-megatron:ubuntu20.04-cu11.8-torch1.14.0-py38_v1.2.7.12_release",
        replicas=1,
        resources=[],
        envs=envs,
        enableRDMA=False,
    )

    job_type = "PyTorchJob"
    labels = []
    priority = "normal"
    data_sources = [
        data_source,
    ]
    enable_bccl = False
    fault_tolerance = False
    fault_tolerance_args = {}
    tensorboard_config = {}
    retention_period = "5m"

    response = aihc_client.job.CreateJob(
        resourcePoolId=resource_pool_id,
        queueID=queue_id,
        name=name,
        command=command,
        jobSpec=job_spec,
        jobType=job_type,
        labels=labels,
        priority=priority,
        dataSources=data_sources,
        enableBccl=enable_bccl,
        faultTolerance=fault_tolerance,
        faultToleranceArgs=fault_tolerance_args,
        tensorboardConfig=tensorboard_config,
        retentionPeriod=retention_period,
    )
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为创建训练任务
查询训练任务监控
使用以下代码可以查询训练任务监控数据。
Python
# Sample script: query monitoring metrics of a training job through the AIHC SDK.
import logging
import os

import dotenv

from baidubce.bce_client_configuration import BceClientConfiguration
from baidubce.auth.bce_credentials import BceCredentials
from baidubce.services.aihc.aihc_client import AihcClient
from baidubce.exception import BceHttpClientError, BceServerError

# Load environment variables
dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger("baidubce").setLevel(logging.INFO)
__logger = logging.getLogger(__name__)
__logger.setLevel(logging.INFO)

# Credentials and endpoint; environment variables win over the placeholders
HOST = os.getenv('HOST') or 'https://aihc.bj.baidubce.com'
AK = os.getenv('AK') or 'your-access-key-id'
SK = os.getenv('SK') or 'your-secret-access-key'

config = BceClientConfiguration(credentials=BceCredentials(AK, SK), endpoint=HOST)
aihc_client = AihcClient(config)

# Query training job metrics
try:
    __logger.info('--------------------DescribeJobMetrics start------------------------')
    resource_pool_id = "cce-xxx"  # replace with the actual resource pool ID
    job_id = "job-xxx"  # replace with the actual job ID
    # NOTE(review): values look like Unix timestamps in seconds -- confirm against the API doc
    start_time = "1758359060"  # start timestamp
    end_time = "1758445563"  # end timestamp
    time_step = "5m"  # interval between metric data points
    metric_type = "GpuUsage"  # metric type
    rate_interval = "5m"  # rate interval for the metric
    response = aihc_client.job.DescribeJobMetrics(
        resourcePoolId=resource_pool_id,
        jobId=job_id,
        metricType=metric_type,
        startTime=start_time,
        endTime=end_time,
        timeStep=time_step,
        rateInterval=rate_interval,
    )
    print(response)
except BceHttpClientError as e:
    # Pass values as logging arguments so formatting is deferred to the logger
    if isinstance(e.last_error, BceServerError):
        __logger.error('send request failed. Response %s, code: %s, msg: %s',
                       e.last_error.status_code, e.last_error.code, str(e.last_error))
    else:
        __logger.error('send request failed. Unknown exception: %s', e)
注意: 根据接口文档去填写具体的访问参数,接口链接为查询训练任务监控
