查看模型评估报告
更新时间:2025-07-03
功能介绍
用于获取评估报告(整体指标)。
使用说明
本文API支持通过Python SDK、Go SDK、Java SDK和Node.js SDK调用,调用流程请参考SDK安装及使用流程。
SDK调用
调用示例
import os
from qianfan import resources

# Initialise the authentication information through environment variables.
# IAM AK/SK authentication is used: replace your_iam_ak with your Access Key
# and your_iam_sk with your Secret Key. For how to obtain them, see
# https://cloud.baidu.com/doc/Reference/s/9jwvz2egb
os.environ["QIANFAN_ACCESS_KEY"] = "your_iam_ak"
os.environ["QIANFAN_SECRET_KEY"] = "your_iam_sk"

resp = resources.console.utils.call_action(
    # Route of this API; a fixed value that does not need to be changed.
    # It is the suffix of the request address given in the API document.
    "/wenxinworkshop/modelrepo/eval/report",
    "",
    # Request body; choose the fields according to the request-parameter
    # description of this document (the Body parameters of the API).
    {
        "id": "ame-vwgs2ybhyhfv",
    },
)

print(resp.body)
1package main
2import (
3 "context"
4 "fmt"
5 "os"
6 "github.com/baidubce/bce-qianfan-sdk/go/qianfan"
7)
8func main() {
9 // 使用安全认证AK/SK鉴权,通过环境变量初始化;替换下列示例中参数,安全认证Access Key替换your_iam_ak,Secret Key替换your_iam_sk
10 os.Setenv("QIANFAN_ACCESS_KEY", "your_iam_ak")
11 os.Setenv("QIANFAN_SECRET_KEY", "your_iam_sk")
12
13 ca := qianfan.NewConsoleAction()
14
15 res, err := ca.Call(context.TODO(),
16 // 调用本文API,该参数值为固定值,无需修改;对应API调用文档-请求结构-请求地址的后缀
17 "/wenxinworkshop/modelrepo/eval/report", "",
18 // 请查看本文请求参数说明,根据实际使用选择参数;对应API调用文档-请求参数-Body参数
19 map[string]interface{}{
20 "id":"ame-vwgs2ybhyhfv",
21 })
22 if err != nil {
23 panic(err)
24 }
25 fmt.Println(string(res.Body))
26
27}
1import com.baidubce.qianfan.Qianfan;
2import com.baidubce.qianfan.model.console.ConsoleResponse;
3import com.baidubce.qianfan.util.CollUtils;
4import com.baidubce.qianfan.util.Json;
5import java.util.Map;
6
7public class Dome {
8 public static void main(String args[]){
9 // 使用安全认证AK/SK鉴权,替换下列示例中参数,安全认证Access Key替换your_iam_ak,Secret Key替换your_iam_sk
10 Qianfan qianfan = new Qianfan("your_iam_ak", "your_iam_sk");
11
12 ConsoleResponse<List<Map<String, Object>>> response = qianfan.console()
13 // 调用本文API,该参数值为固定值,无需修改;对应API调用文档-请求结构-请求地址的后缀
14 .route("/wenxinworkshop/modelrepo/eval/report")
15 // 需要传入参数的场景,可以自行封装请求类,或者使用Map.of()来构建请求Body
16 // Java 8可以使用SDK提供的CollUtils.mapOf()来替代Map.of()
17 // 请查看本文请求参数说明,根据实际使用选择参数;对应API调用文档-请求参数-Body参数
18 .body(CollUtils.mapOf(
19 "id","ame-vwgs2ybhyhfv"
20 ))
21 .execute(new TypeRef<List<Map<String, Object>>>() {});
22
23 System.out.println(Json.serialize(response));
24 }
25}
import {consoleAction, setEnvVariable} from "@baiducloud/qianfan";

// IAM AK/SK authentication, initialised through environment variables.
// Replace your_iam_ak with your Access Key and your_iam_sk with your Secret Key.
setEnvVariable('QIANFAN_ACCESS_KEY', 'your_iam_ak');
setEnvVariable('QIANFAN_SECRET_KEY', 'your_iam_sk');

// Requests the evaluation report of one evaluation task and logs the response.
async function main() {
    // base_api_route: route of this API; a fixed value that does not need to be
    // changed. It is the suffix of the request address given in the API document.
    // data: request body; choose the fields according to the request-parameter
    // description of this document (the Body parameters of the API).
    const res = await consoleAction({
        base_api_route: '/wenxinworkshop/modelrepo/eval/report',
        data: {
            "id": "ame-vwgs2ybhyhfv"
        }
    });

    console.log(res);
}

main();
返回示例
1{
2 "log_id": "3617826755",
3 "result": [
4 {
5 "modelName": "llama2_7b_32k_z_sft",
6 "modelVersion": "1",
7 "modelVersionSource": "Train",
8 "evalMode": "manual",
9 "evaluationName": "cl_联调_模型评估_用户bos",
10 "id": "65eae1fb1xxx9ca97a1",
11 "modelVersionId": 833,
12 "modelId": 591,
13 "userId": 1,
14 "evaluationId": 401,
15 "modelForm": "model",
16 "modelIdStr": "am-dkxwxxxxjgw",
17 "modelVersionIdStr": "amv-7ab3xxxtspe1",
18 "evaluationIdStr": "ame-28zxxx2rn4",
19 "evalUnitId": "ameu-gpvzxxxs0n",
20 "inferDatasetId": "ds-p79kyxxx7sbk",
21 "inferDatasetName": "cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr",
22 "effectMetric": {
23 "accuracy": 0,
24 "f1Score": 0,
25 "rouge_1": 0,
26 "rouge_2": 0,
27 "rouge_l": 0,
28 "bleu4": 0,
29 "avgJudgeScore": 0,
30 "stdJudgeScore": 0,
31 "medianJudgeScore": 0,
32 "scoreDistribution": null,
33 "manualAvgScore": 0.5,
34 "goodCaseProportion": 0,
35 "subjectiveImpression": "1",
36 "manualScoreDistribution": [
37 {
38 "dimension": "满意度",
39 "scoreDistribution": {
40 "-1": 2,
41 "1": 1
42 }
43 },
44 {
45 "dimension": "安全性",
46 "scoreDistribution": {
47 "-1": 2,
48 "0": 1
49 }
50 }
51 ]
52 },
53 "performanceMetric": {}
54 },
55 {
56 "modelName": "mixtral2",
57 "modelVersion": "8",
58 "modelVersionSource": "Train",
59 "evalMode": "manual",
60 "evaluationName": "cl_联调_模型评估_用户bos",
61 "id": "65eae45dxxxcab739",
62 "modelVersionId": 7xx,
63 "modelId": 545,
64 "userId": 1,
65 "evaluationId": 401,
66 "modelForm": "model",
67 "modelIdStr": "am-ktcxxx88z",
68 "modelVersionIdStr": "amv-g2acxxxg9v",
69 "evaluationIdStr": "ame-28zxxx2rn4",
70 "evalUnitId": "ameu-1uxpxxx8uc2",
71 "inferDatasetId": "ds-ba82xxxguh",
72 "inferDatasetName": "cl_联调_模型评估_用户bos_mixtral2_V8_x5xt",
73 "effectMetric": {
74 "accuracy": 0,
75 "f1Score": 0,
76 "rouge_1": 0,
77 "rouge_2": 0,
78 "rouge_l": 0,
79 "bleu4": 0,
80 "avgJudgeScore": 0,
81 "stdJudgeScore": 0,
82 "medianJudgeScore": 0,
83 "scoreDistribution": null,
84 "manualAvgScore": 0.5,
85 "goodCaseProportion": 0,
86 "subjectiveImpression": "2",
87 "manualScoreDistribution": [
88 {
89 "dimension": "满意度",
90 "scoreDistribution": {
91 "-1": 2,
92 "1": 1
93 }
94 },
95 {
96 "dimension": "安全性",
97 "scoreDistribution": {
98 "-1": 2,
99 "0": 1
100 }
101 }
102 ]
103 },
104 "performanceMetric": {}
105 }
106 ]
107}
1{
2 "log_id": "3617826755",
3 "result": [
4 {
5 "modelName": "llama2_7b_32k_z_sft",
6 "modelVersion": "1",
7 "modelVersionSource": "Train",
8 "evalMode": "manual",
9 "evaluationName": "cl_联调_模型评估_用户bos",
10 "id": "65eae1fb1xxx9ca97a1",
11 "modelVersionId": 833,
12 "modelId": 591,
13 "userId": 1,
14 "evaluationId": 401,
15 "modelForm": "model",
16 "modelIdStr": "am-dkxwxxxxjgw",
17 "modelVersionIdStr": "amv-7ab3xxxtspe1",
18 "evaluationIdStr": "ame-28zxxx2rn4",
19 "evalUnitId": "ameu-gpvzxxxs0n",
20 "inferDatasetId": "ds-p79kyxxx7sbk",
21 "inferDatasetName": "cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr",
22 "effectMetric": {
23 "accuracy": 0,
24 "f1Score": 0,
25 "rouge_1": 0,
26 "rouge_2": 0,
27 "rouge_l": 0,
28 "bleu4": 0,
29 "avgJudgeScore": 0,
30 "stdJudgeScore": 0,
31 "medianJudgeScore": 0,
32 "scoreDistribution": null,
33 "manualAvgScore": 0.5,
34 "goodCaseProportion": 0,
35 "subjectiveImpression": "1",
36 "manualScoreDistribution": [
37 {
38 "dimension": "满意度",
39 "scoreDistribution": {
40 "-1": 2,
41 "1": 1
42 }
43 },
44 {
45 "dimension": "安全性",
46 "scoreDistribution": {
47 "-1": 2,
48 "0": 1
49 }
50 }
51 ]
52 },
53 "performanceMetric": {}
54 },
55 {
56 "modelName": "mixtral2",
57 "modelVersion": "8",
58 "modelVersionSource": "Train",
59 "evalMode": "manual",
60 "evaluationName": "cl_联调_模型评估_用户bos",
61 "id": "65eae45dxxxcab739",
62 "modelVersionId": 7xx,
63 "modelId": 545,
64 "userId": 1,
65 "evaluationId": 401,
66 "modelForm": "model",
67 "modelIdStr": "am-ktcxxx88z",
68 "modelVersionIdStr": "amv-g2acxxxg9v",
69 "evaluationIdStr": "ame-28zxxx2rn4",
70 "evalUnitId": "ameu-1uxpxxx8uc2",
71 "inferDatasetId": "ds-ba82xxxguh",
72 "inferDatasetName": "cl_联调_模型评估_用户bos_mixtral2_V8_x5xt",
73 "effectMetric": {
74 "accuracy": 0,
75 "f1Score": 0,
76 "rouge_1": 0,
77 "rouge_2": 0,
78 "rouge_l": 0,
79 "bleu4": 0,
80 "avgJudgeScore": 0,
81 "stdJudgeScore": 0,
82 "medianJudgeScore": 0,
83 "scoreDistribution": null,
84 "manualAvgScore": 0.5,
85 "goodCaseProportion": 0,
86 "subjectiveImpression": "2",
87 "manualScoreDistribution": [
88 {
89 "dimension": "满意度",
90 "scoreDistribution": {
91 "-1": 2,
92 "1": 1
93 }
94 },
95 {
96 "dimension": "安全性",
97 "scoreDistribution": {
98 "-1": 2,
99 "0": 1
100 }
101 }
102 ]
103 },
104 "performanceMetric": {}
105 }
106 ]
107}
1{
2 "log_id": "3617826755",
3 "result": [
4 {
5 "modelName": "llama2_7b_32k_z_sft",
6 "modelVersion": "1",
7 "modelVersionSource": "Train",
8 "evalMode": "manual",
9 "evaluationName": "cl_联调_模型评估_用户bos",
10 "id": "65eae1fb1xxx9ca97a1",
11 "modelVersionId": 833,
12 "modelId": 591,
13 "userId": 1,
14 "evaluationId": 401,
15 "modelForm": "model",
16 "modelIdStr": "am-dkxwxxxxjgw",
17 "modelVersionIdStr": "amv-7ab3xxxtspe1",
18 "evaluationIdStr": "ame-28zxxx2rn4",
19 "evalUnitId": "ameu-gpvzxxxs0n",
20 "inferDatasetId": "ds-p79kyxxx7sbk",
21 "inferDatasetName": "cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr",
22 "effectMetric": {
23 "accuracy": 0,
24 "f1Score": 0,
25 "rouge_1": 0,
26 "rouge_2": 0,
27 "rouge_l": 0,
28 "bleu4": 0,
29 "avgJudgeScore": 0,
30 "stdJudgeScore": 0,
31 "medianJudgeScore": 0,
32 "scoreDistribution": null,
33 "manualAvgScore": 0.5,
34 "goodCaseProportion": 0,
35 "subjectiveImpression": "1",
36 "manualScoreDistribution": [
37 {
38 "dimension": "满意度",
39 "scoreDistribution": {
40 "-1": 2,
41 "1": 1
42 }
43 },
44 {
45 "dimension": "安全性",
46 "scoreDistribution": {
47 "-1": 2,
48 "0": 1
49 }
50 }
51 ]
52 },
53 "performanceMetric": {}
54 },
55 {
56 "modelName": "mixtral2",
57 "modelVersion": "8",
58 "modelVersionSource": "Train",
59 "evalMode": "manual",
60 "evaluationName": "cl_联调_模型评估_用户bos",
61 "id": "65eae45dxxxcab739",
62 "modelVersionId": 7xx,
63 "modelId": 545,
64 "userId": 1,
65 "evaluationId": 401,
66 "modelForm": "model",
67 "modelIdStr": "am-ktcxxx88z",
68 "modelVersionIdStr": "amv-g2acxxxg9v",
69 "evaluationIdStr": "ame-28zxxx2rn4",
70 "evalUnitId": "ameu-1uxpxxx8uc2",
71 "inferDatasetId": "ds-ba82xxxguh",
72 "inferDatasetName": "cl_联调_模型评估_用户bos_mixtral2_V8_x5xt",
73 "effectMetric": {
74 "accuracy": 0,
75 "f1Score": 0,
76 "rouge_1": 0,
77 "rouge_2": 0,
78 "rouge_l": 0,
79 "bleu4": 0,
80 "avgJudgeScore": 0,
81 "stdJudgeScore": 0,
82 "medianJudgeScore": 0,
83 "scoreDistribution": null,
84 "manualAvgScore": 0.5,
85 "goodCaseProportion": 0,
86 "subjectiveImpression": "2",
87 "manualScoreDistribution": [
88 {
89 "dimension": "满意度",
90 "scoreDistribution": {
91 "-1": 2,
92 "1": 1
93 }
94 },
95 {
96 "dimension": "安全性",
97 "scoreDistribution": {
98 "-1": 2,
99 "0": 1
100 }
101 }
102 ]
103 },
104 "performanceMetric": {}
105 }
106 ]
107}
1{
2 log_id: '3617826755',
3 result: [
4 {
5 modelName: 'llama2_7b_32k_z_sft',
6 modelVersion: '1',
7 modelVersionSource: 'Train',
8 evalMode: 'manual',
9 evaluationName: 'cl_联调_模型评估_用户bos',
10 id: '65eae1fb1xxx9ca97a1',
11 modelVersionId: 833,
12 modelId: 591,
13 userId: 1,
14 evaluationId: 401,
15 modelForm: 'model',
16 modelIdStr: 'am-dkxwxxxxjgw',
17 modelVersionIdStr: 'amv-7ab3xxxtspe1',
18 evaluationIdStr: 'ame-28zxxx2rn4',
19 evalUnitId: 'ameu-gpvzxxxs0n',
20 inferDatasetId: 'ds-p79kyxxx7sbk',
21 inferDatasetName: 'cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr',
22 effectMetric: {
23 accuracy: 0,
24 f1Score: 0,
25 rouge_1: 0,
26 rouge_2: 0,
27 rouge_l: 0,
28 bleu4: 0,
29 avgJudgeScore: 0,
30 stdJudgeScore: 0,
31 medianJudgeScore: 0,
32 scoreDistribution: null,
33 manualAvgScore: 0.5,
34 goodCaseProportion: 0,
35 subjectiveImpression: '1',
36 manualScoreDistribution: [
37 {
38 dimension: '满意度',
39 scoreDistribution: {
40 -1: 2,
41 1: 1
42 }
43 },
44 {
45 dimension: '安全性',
46 scoreDistribution: {
47 -1: 2,
48 0: 1
49 }
50 }
51 ]
52 },
53 performanceMetric: {}
54 },
55 {
56 modelName: 'mixtral2',
57 modelVersion: '8',
58 modelVersionSource: 'Train',
59 evalMode: 'manual',
60 evaluationName: 'cl_联调_模型评估_用户bos',
61 id: '65eae45dxxxcab739',
62 modelVersionId: 7xx,
63 modelId: 545,
64 userId: 1,
65 evaluationId: 401,
66 modelForm: 'model',
67 modelIdStr: 'am-ktcxxx88z',
68 modelVersionIdStr: 'amv-g2acxxxg9v',
69 evaluationIdStr: 'ame-28zxxx2rn4',
70 evalUnitId: 'ameu-1uxpxxx8uc2',
71 inferDatasetId: 'ds-ba82xxxguh',
72 inferDatasetName: 'cl_联调_模型评估_用户bos_mixtral2_V8_x5xt',
73 effectMetric: {
74 accuracy: 0,
75 f1Score: 0,
76 rouge_1: 0,
77 rouge_2: 0,
78 rouge_l: 0,
79 bleu4: 0,
80 avgJudgeScore: 0,
81 stdJudgeScore: 0,
82 medianJudgeScore: 0,
83 scoreDistribution: null,
84 manualAvgScore: 0.5,
85 goodCaseProportion: 0,
86 subjectiveImpression: '2',
87 manualScoreDistribution: [
88 {
89 dimension: '满意度',
90 scoreDistribution: {
91 -1: 2,
92 1: 1
93 }
94 },
95 {
96 dimension: '安全性',
97 scoreDistribution: {
98 -1: 2,
99 0: 1
100 }
101 }
102 ]
103 },
104 performanceMetric: {}
105 }
106 ]
107}
请求参数
名称 | 类型 | 必填 | 描述 |
---|---|---|---|
id | string | 是 | 评估任务ID,示例:ame-vwgs2ybhyhfv,说明: (1)可以通过以下方式获取该字段值: · 方式一:通过调用创建模型评估任务接口,返回的字段evalIdStr获取 · 方式二:在控制台-模型评估页面,点击某评估任务名称打开详情页,在任务详情的基本信息中查看 (2)该字段新增支持string类型,如果之前使用的是int类型,建议变更为string类型,后续可能将逐步废弃int类型;例如之前是通过调用创建模型评估任务接口,返回的字段evalId获取,建议替换为返回的evalIdStr获取 |
返回参数
名称 | 类型 | 描述 |
---|---|---|
log_id | string | 请求ID |
result | List<object> | 请求结果 |
result说明
名称 | 类型 | 描述 |
---|---|---|
evaluationId | int | 评估任务ID |
evaluationName | string | 评估任务名称 |
modelId | int | 模型ID |
modelVersionId | int | 模型版本ID |
modelName | string | 模型名 |
modelVersion | string | 模型版本号 |
modelVersionSource | string | 模型版本来源 |
evalMode | string | 评估模式,说明: (1)有以下评估模式: · rule:基于规则 · model:裁判员模型 · manual:人工评估 (2)同时使用多个模式时,各模式之间以英文逗号","拼接,示例:“model,manual,rule” |
effectMetric | object | 效果指标 |
modelForm | string | 评估的物料类型,说明: · model:旧数据(推理结果集评估功能上线前的评估任务)类型都是模型,即值为model · inferDataset:推理结果集 |
modelIdStr | string | 模型字符串ID |
modelVersionIdStr | string | 模型版本字符串ID |
evaluationIdStr | string | 评估任务字符串ID |
evalUnitId | string | 评估子任务ID,用于唯一标识评估子任务 |
inferDatasetId | string | 当前评估子任务使用的推理结果集ID |
inferDatasetName | string | 当前评估子任务使用的推理结果集名称 |
effectMetric说明
名称 | 类型 | 描述 |
---|---|---|
id | string | 单个评估报告的主键;注意:按返回示例,该字段位于result的每个对象中(与effectMetric同级),而不在effectMetric对象内 |
accuracy | number | 基于规则-准确率打分(Accuracy) |
f1Score | number | 基于规则-准确率打分(F1 Score) |
rouge_1 | number | 基于规则-相似度打分(ROUGE-1) |
rouge_2 | number | 基于规则-相似度打分(ROUGE-2) |
rouge_l | number | 基于规则-相似度打分(ROUGE-L) |
bleu4 | number | 基于规则-相似度打分(BLEU-4) |
avgJudgeScore | number | 裁判员打分-均值 |
stdJudgeScore | number | 裁判员打分-标准差 |
medianJudgeScore | number | 裁判员打分-中位数 |
scoreDistribution | map[string]int | 裁判员打分-分值分布,说明: · 含有从最小值到最大值的所有分数 · -1为无效打分 |
manualAvgScore | number | 人工打分-平均分 |
goodCaseProportion | int | Good case占比 |
subjectiveImpression | string | 人工打分-主观印象 |
manualScoreDistribution | List<object> | 人工打分-维度分数分布 |
manualScoreDistribution说明
名称 | 类型 | 描述 |
---|---|---|
dimension | string | 评价维度 |
scoreDistribution | map[string]int | 维度分值分布,key为分值,value为分值的个数 |