查看模型评估任务报告
更新时间:2024-12-04
功能介绍
用于根据评估任务id,查看模型评估任务报告。
使用说明
本文API支持通过Python SDK、Go SDK、Java SDK 和 Node.js SDK调用,调用流程请参考SDK安装及使用流程。
SDK调用
调用示例
import os
from qianfan import resources
# Initialise the credentials through environment variables.
# Authenticate with the IAM Access Key / Secret Key pair: replace your_iam_ak with your
# Access Key and your_iam_sk with your Secret Key. For how to obtain them, see
# https://cloud.baidu.com/doc/Reference/s/9jwvz2egb
credentials = {
    "QIANFAN_ACCESS_KEY": "your_iam_ak",
    "QIANFAN_SECRET_KEY": "your_iam_sk",
}
for env_name, env_value in credentials.items():
    os.environ[env_name] = env_value

# Fixed value for this API, do not modify; corresponds to the suffix of the request URL
# under "HTTP call > Request description > Request URL" in this document.
route = "/v2/eval"
# Fixed value for this API, do not modify; corresponds to the Action Query parameter
# under "HTTP call > Request description > Request parameters" in this document.
action = "DescribeEvalTaskReport"
# Corresponds to the Body parameters under "HTTP call > Request description > Request
# parameters"; see the Body parameter description and pick the parameters you need.
request_body = {"taskId": "ame-4kvnxxxxx"}

resp = resources.console.utils.call_action(route, action, request_body)
print(resp.body)
package main
import (
"context"
"fmt"
"os"
"github.com/baidubce/bce-qianfan-sdk/go/qianfan"
)
func main() {
	// Authenticate with the IAM Access Key / Secret Key pair, initialised through
	// environment variables. Replace your_iam_ak with your Access Key and
	// your_iam_sk with your Secret Key.
	os.Setenv("QIANFAN_ACCESS_KEY", "your_iam_ak")
	os.Setenv("QIANFAN_SECRET_KEY", "your_iam_sk")

	// Corresponds to the Body parameters under "HTTP call > Request description >
	// Request parameters" in this document; see the Body parameter description and
	// pick the parameters you need.
	requestBody := map[string]interface{}{
		"taskId": "ame-4kvnxxx",
	}

	consoleAction := qianfan.NewConsoleAction()
	resp, err := consoleAction.Call(
		context.TODO(),
		// Fixed value for this API, do not modify; corresponds to the suffix of the
		// request URL under "HTTP call > Request description > Request URL".
		"/v2/eval",
		// Fixed value for this API, do not modify; corresponds to the Action Query
		// parameter under "HTTP call > Request description > Request parameters".
		"DescribeEvalTaskReport",
		requestBody,
	)
	if err != nil {
		panic(err)
	}

	fmt.Println(string(resp.Body))
}
import com.baidubce.qianfan.Qianfan;
import com.baidubce.qianfan.model.console.ConsoleResponse;
import com.baidubce.qianfan.util.CollUtils;
import com.baidubce.qianfan.util.Json;
import java.util.Map;
// Sample program: sends the DescribeEvalTaskReport console action for one evaluation
// task id and prints the serialized response.
// NOTE(review): the class name "Dome" looks like a typo for "Demo" — confirm before
// renaming, since a public class name must match its source file name.
public class Dome {
public static void main(String args[]){
// Authenticate with the IAM Access Key / Secret Key pair: replace your_iam_ak with your Access Key and your_iam_sk with your Secret Key
Qianfan qianfan = new Qianfan("your_iam_ak", "your_iam_sk");
ConsoleResponse<Map<String, Object>> response = qianfan.console()
// Fixed value for this API, do not modify; corresponds to the suffix of the request URL under "HTTP call > Request description > Request URL" in this document
.route("/v2/eval")
// Fixed value for this API, do not modify; corresponds to the Action Query parameter under "HTTP call > Request description > Request parameters" in this document
.action("DescribeEvalTaskReport")
// When parameters must be passed, either wrap them in your own request class or build the request Body with Map.of()
// On Java 8, the SDK-provided CollUtils.mapOf() can be used in place of Map.of()
// Corresponds to the Body parameters under "HTTP call > Request description > Request parameters"; see the Body parameter description and pick the parameters you need
.body(CollUtils.mapOf(
"taskId", "ame-4kvnxxx"
))
.execute();
System.out.println(Json.serialize(response));
}
}
import {consoleAction, setEnvVariable} from "@baiducloud/qianfan";
// Authenticate with the IAM Access Key / Secret Key pair, initialised through environment variables.
// Replace your_iam_ak with your Access Key and your_iam_sk with your Secret Key.
setEnvVariable('QIANFAN_ACCESS_KEY','your_iam_ak');
setEnvVariable('QIANFAN_SECRET_KEY','your_iam_sk');

/**
 * Sends the DescribeEvalTaskReport console action for one evaluation task id
 * and logs the response.
 */
async function main() {
    // base_api_route: fixed value for this API, do not modify; corresponds to the suffix of the
    //   request URL under "HTTP call > Request description > Request URL" in this document.
    // action: fixed value for this API, do not modify; corresponds to the Action Query parameter
    //   under "HTTP call > Request description > Request parameters" in this document.
    // data: corresponds to the Body parameters under "HTTP call > Request description >
    //   Request parameters"; see the Body parameter description and pick the parameters you need.
    const res = await consoleAction({
        base_api_route: '/v2/eval',
        action: 'DescribeEvalTaskReport',
        data: {
            "taskId": "ame-4kvnxxx"
        }
    }); // the original sample was missing the brace that closes the outer options object
    console.log(res);
}

main();
返回示例
{
'requestId': 'd60a00c4-a724-4851-96e5-b4dc3b258ca0',
'result': [
{
'taskId': 'ame-4kvnxxxx',
'taskName': '自动评估_停止测试0910',
'modelId': 'amv-tts8v6re61hp',
'inferDatasetId': 'ds-ecwqqjb787dk1vm6',
'evalObjectType': 'service',
'evalMode': 'rule',
'effectMetric': {
'accuracy': 0,
'f1Score': 0.34983957,
'rouge_1': 0.33882716,
'rouge_2': 0.15241386,
'rouge_l': 0.26100817,
'bleu4': 0.09671887,
'avgJudgeScore': 0,
'stdJudgeScore': 0,
'medianJudgeScore': 0,
'scoreDistribution': null,
'manualAvgScore': 0,
'goodCaseProportion': 0,
'subjectiveImpression': '',
'manualScoreDistribution': null,
'gsbDistribution': null
}
},
{
'taskId': 'ame-4kvnxxxx',
'taskName': '自动评估_停止测试0910',
'modelId': 'amv-6j6is3sp166h',
'inferDatasetId': 'ds-sueg3fqnd14h9kqt',
'evalObjectType': 'service',
'evalMode': 'rule',
'effectMetric': {
'accuracy': 0,
'f1Score': 0.34691638,
'rouge_1': 0.32689363,
'rouge_2': 0.13487022,
'rouge_l': 0.25140443,
'bleu4': 0.087691635,
'edit_dist': 331.97778,
'embedding_dist': 0.16930991,
'avgJudgeScore': 0,
'stdJudgeScore': 0,
'medianJudgeScore': 0,
'scoreDistribution': null,
'manualAvgScore': 0,
'goodCaseProportion': 0,
'subjectiveImpression': '',
'manualScoreDistribution': null,
'gsbDistribution': null
}
}
]
}
{
"requestId": "d60a00c4-a724-4851-96e5-b4dc3b258ca0",
"result": [
{
"taskId": "ame-4kvnxxxx",
"taskName": "自动评估_停止测试0910",
"modelId": "amv-tts8v6re61hp",
"inferDatasetId": "ds-ecwqqjb787dk1vm6",
"evalObjectType": "service",
"evalMode": "rule",
"effectMetric": {
"accuracy": 0,
"f1Score": 0.34983957,
"rouge_1": 0.33882716,
"rouge_2": 0.15241386,
"rouge_l": 0.26100817,
"bleu4": 0.09671887,
"avgJudgeScore": 0,
"stdJudgeScore": 0,
"medianJudgeScore": 0,
"scoreDistribution": null,
"manualAvgScore": 0,
"goodCaseProportion": 0,
"subjectiveImpression": "",
"manualScoreDistribution": null,
"gsbDistribution": null
}
},
{
"taskId": "ame-4kvnxxxx",
"taskName": "自动评估_停止测试0910",
"modelId": "amv-6j6is3sp166h",
"inferDatasetId": "ds-sueg3fqnd14h9kqt",
"evalObjectType": "service",
"evalMode": "rule",
"effectMetric": {
"accuracy": 0,
"f1Score": 0.34691638,
"rouge_1": 0.32689363,
"rouge_2": 0.13487022,
"rouge_l": 0.25140443,
"bleu4": 0.087691635,
"edit_dist": 331.97778,
"embedding_dist": 0.16930991,
"avgJudgeScore": 0,
"stdJudgeScore": 0,
"medianJudgeScore": 0,
"scoreDistribution": null,
"manualAvgScore": 0,
"goodCaseProportion": 0,
"subjectiveImpression": "",
"manualScoreDistribution": null,
"gsbDistribution": null
}
}
]
}
{
"requestId": "d60a00c4-a724-4851-96e5-b4dc3b258ca0",
"result": [
{
"taskId": "ame-4kvnxxxx",
"taskName": "自动评估_停止测试0910",
"modelId": "amv-tts8v6re61hp",
"inferDatasetId": "ds-ecwqqjb787dk1vm6",
"evalObjectType": "service",
"evalMode": "rule",
"effectMetric": {
"accuracy": 0,
"f1Score": 0.34983957,
"rouge_1": 0.33882716,
"rouge_2": 0.15241386,
"rouge_l": 0.26100817,
"bleu4": 0.09671887,
"avgJudgeScore": 0,
"stdJudgeScore": 0,
"medianJudgeScore": 0,
"scoreDistribution": null,
"manualAvgScore": 0,
"goodCaseProportion": 0,
"subjectiveImpression": "",
"manualScoreDistribution": null,
"gsbDistribution": null
}
},
{
"taskId": "ame-4kvnxxxx",
"taskName": "自动评估_停止测试0910",
"modelId": "amv-6j6is3sp166h",
"inferDatasetId": "ds-sueg3fqnd14h9kqt",
"evalObjectType": "service",
"evalMode": "rule",
"effectMetric": {
"accuracy": 0,
"f1Score": 0.34691638,
"rouge_1": 0.32689363,
"rouge_2": 0.13487022,
"rouge_l": 0.25140443,
"bleu4": 0.087691635,
"edit_dist": 331.97778,
"embedding_dist": 0.16930991,
"avgJudgeScore": 0,
"stdJudgeScore": 0,
"medianJudgeScore": 0,
"scoreDistribution": null,
"manualAvgScore": 0,
"goodCaseProportion": 0,
"subjectiveImpression": "",
"manualScoreDistribution": null,
"gsbDistribution": null
}
}
]
}
{
requestId: 'd60a00c4-a724-4851-96e5-b4dc3b258ca0',
result: [
{
taskId: 'ame-4kvnxxxx',
taskName: '自动评估_停止测试0910',
modelId: 'amv-tts8v6re61hp',
inferDatasetId: 'ds-ecwqqjb787dk1vm6',
evalObjectType: 'service',
evalMode: 'rule',
effectMetric: {
accuracy: 0,
f1Score: 0.34983957,
rouge_1: 0.33882716,
rouge_2: 0.15241386,
rouge_l: 0.26100817,
bleu4: 0.09671887,
avgJudgeScore: 0,
stdJudgeScore: 0,
medianJudgeScore: 0,
scoreDistribution: null,
manualAvgScore: 0,
goodCaseProportion: 0,
subjectiveImpression: '',
manualScoreDistribution: null,
gsbDistribution: null
}
},
{
taskId: 'ame-4kvnxxxx',
taskName: '自动评估_停止测试0910',
modelId: 'amv-6j6is3sp166h',
inferDatasetId: 'ds-sueg3fqnd14h9kqt',
evalObjectType: 'service',
evalMode: 'rule',
effectMetric: {
accuracy: 0,
f1Score: 0.34691638,
rouge_1: 0.32689363,
rouge_2: 0.13487022,
rouge_l: 0.25140443,
bleu4: 0.087691635,
edit_dist: 331.97778,
embedding_dist: 0.16930991,
avgJudgeScore: 0,
stdJudgeScore: 0,
medianJudgeScore: 0,
scoreDistribution: null,
manualAvgScore: 0,
goodCaseProportion: 0,
subjectiveImpression: '',
manualScoreDistribution: null,
gsbDistribution: null
}
}
]
}
请求参数
名称 | 类型 | 必填 | 描述 |
---|---|---|---|
taskId | string | 是 | 评估任务id,说明: (1)可以通过以下方式获取该字段值: · 方式一,通过调用创建模型评估任务接口,返回的字段result获取 · 方式二,在控制台-模型调优-模型评估,点击某评估任务名称打开详情页,在任务详情的基本信息中查看,如下图所示 |
返回参数
名称 | 类型 | 描述 |
---|---|---|
requestId | string | 请求ID |
code | string | 错误码,请求失败时返回 |
message | string | 错误信息,请求失败时返回 |
result | List<object> | 请求结果,请求成功时返回 |
result说明
名称 | 类型 | 描述 |
---|---|---|
taskId | string | 评估任务ID |
taskName | string | 评估任务名称 |
modelId | string | 模型版本ID |
inferDatasetId | string | 当前评估子任务使用的推理结果集id |
evalObjectType | string | 评估的数据类型,说明: · model:模型推理 · inferDataset:推理结果集 |
evalMode | string | 评估模式,具体值如下: · rule:基于规则 · model:裁判员模型 · manual:人工评估 · rule,model:同时支持自动规则和自动裁判员评估 |
effectMetric | object | 效果指标 |
effectMetric说明
名称 | 类型 | 描述 |
---|---|---|
accuracy | number | 基于规则-准确率打分 |
f1Score | number | 基于规则-准确率打分 |
rouge_1 | number | 基于规则-相似度打分 |
rouge_2 | number | 基于规则-相似度打分 |
rouge_l | number | 基于规则-相似度打分 |
bleu4 | number | 基于规则-相似度打分 |
avgJudgeScore | number | 裁判员打分-均值 |
stdJudgeScore | number | 裁判员打分-标准差 |
medianJudgeScore | number | 裁判员打分-中位数 |
scoreDistribution | map[string]int | 裁判员打分-分值分布,说明: (1)含有从最小值到最大值的所有分数 (2)-1为无效打分 |
manualAvgScore | number | 平均分 |
goodCaseProportion | number | good case占比 |
subjectiveImpression | string | 人工打分-主观印象 |
manualScoreDistribution | object[] | 维度分数分布 |
gsbDistribution | map[string]int | gsb打分分布 |
manualScoreDistribution说明
名称 | 类型 | 描述 |
---|---|---|
dimension | string | 评价维度 |
scoreDistribution | map[string]int | 维度分值分布,key为分值,value为分值的个数 |