查看模型评估报告
更新时间:2024-11-08
功能介绍
用于获取评估报告(整体指标)。
使用说明
本文API支持通过Python SDK、Go SDK、Java SDK 和 Node.js SDK调用,调用流程请参考SDK安装及使用流程。
SDK调用
调用示例
import os
from qianfan import resources
# Authenticate with IAM AK/SK through environment variables. Replace the
# placeholders: your_iam_ak is the Access Key, your_iam_sk is the Secret Key.
os.environ["QIANFAN_ACCESS_KEY"] = "your_iam_ak"
os.environ["QIANFAN_SECRET_KEY"] = "your_iam_sk"

# Pass the evaluation task id as a string (e.g. "ame-vwgs2ybhyhfv"). The request
# parameter table documents eval_id as a string and notes that the legacy int
# form may be phased out.
resp = resources.Model.get_evaluation_result(eval_id="ame-vwgs2ybhyhfv")
print(resp)
package main
import (
"context"
"fmt"
"os"
"github.com/baidubce/bce-qianfan-sdk/go/qianfan"
)
// main requests the overall evaluation report of one evaluation task through
// the Qianfan console action and prints the raw response body.
func main() {
	// Authenticate with IAM AK/SK through environment variables. Replace
	// your_iam_ak with the Access Key and your_iam_sk with the Secret Key.
	os.Setenv("QIANFAN_ACCESS_KEY", "your_iam_ak")
	os.Setenv("QIANFAN_SECRET_KEY", "your_iam_sk")

	// Fixed route of this API (the suffix of its HTTP request address);
	// it does not need to be modified.
	const route = "/wenxinworkshop/modelrepo/eval/report"

	// Request body; the keys correspond to the Body parameters described in
	// the request parameter section. Choose parameters according to actual use.
	payload := map[string]interface{}{
		"id": "ame-vwgs2ybhyhfv",
	}

	action := qianfan.NewConsoleAction()
	reply, err := action.Call(context.TODO(), route, "", payload)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(reply.Body))
}
import com.baidubce.qianfan.Qianfan;
import com.baidubce.qianfan.model.console.ConsoleResponse;
import com.baidubce.qianfan.util.CollUtils;
import com.baidubce.qianfan.util.Json;
import com.baidubce.qianfan.util.TypeRef;
import java.util.List;
import java.util.Map;
/**
 * Example: requests the overall evaluation report of one evaluation task
 * through the Qianfan console API and prints the serialized response.
 */
public class Dome {
    public static void main(String[] args) {
        // Authenticate with IAM AK/SK. Replace your_iam_ak with the Access Key
        // and your_iam_sk with the Secret Key.
        Qianfan qianfan = new Qianfan("your_iam_ak", "your_iam_sk");

        // The "result" field of the response is an array of report objects,
        // hence the List<Map<String, Object>> payload type.
        ConsoleResponse<List<Map<String, Object>>> response = qianfan.console()
                // Fixed route of this API (the suffix of its HTTP request
                // address); it does not need to be modified.
                .route("/wenxinworkshop/modelrepo/eval/report")
                // When parameters are required, wrap them in your own request
                // class or build the body with Map.of(). On Java 8 use the
                // SDK's CollUtils.mapOf() in place of Map.of(). The keys
                // correspond to the Body parameters of the request; choose
                // parameters according to actual use.
                .body(CollUtils.mapOf(
                        "id", "ame-vwgs2ybhyhfv"
                ))
                .execute(new TypeRef<List<Map<String, Object>>>() {});
        System.out.println(Json.serialize(response));
    }
}
import {consoleAction, setEnvVariable} from "@baiducloud/qianfan";
// Authenticate with IAM AK/SK through environment variables. Replace
// your_iam_ak with the Access Key and your_iam_sk with the Secret Key.
setEnvVariable('QIANFAN_ACCESS_KEY', 'your_iam_ak');
setEnvVariable('QIANFAN_SECRET_KEY', 'your_iam_sk');

/**
 * Requests the overall evaluation report of one evaluation task and logs
 * the response.
 */
async function main() {
    const request = {
        // base_api_route: fixed route of this API (the suffix of its HTTP
        // request address); it does not need to be modified.
        base_api_route: '/wenxinworkshop/modelrepo/eval/report',
        // data: request body; the keys correspond to the Body parameters of
        // the request. Choose parameters according to actual use.
        data: {
            id: 'ame-vwgs2ybhyhfv',
        },
    };
    const report = await consoleAction(request);
    console.log(report);
}

main();
返回示例
QfResponse(code=200,
headers={...},
body={
'log_id':'111',
'result': [{
'effectMetric': {
'accuracy': 0,
'avgJudgeScore': 0,
'bleu4': 0,
'f1Score': 0,
'goodCaseProportion': 0,
'manualAvgScore': 0,
'manualScoreDistribution': None,
'medianJudgeScore': 0,
'rouge_1': 0,
'rouge_2': 0,
'rouge_l': 0,
'scoreDistribution': None,
'stdJudgeScore': 0,
'subjectiveImpression': ''
},
'evalMode': 'manual',
'evaluationId': 220,
'evaluationName': 'xxx',
'modelId': 1111,
'modelName': 'm_17320_9098',
'modelVersion': '1',
'modelVersionId': 111,
'modelVersionSource': 'Train',
'modelForm': 'model',
'modelIdStr': 'am-dkxwxxxxjgw',
'modelVersionIdStr': 'amv-7ab3xxxtspe1',
'evaluationIdStr': 'ame-28zxxx2rn4',
'evalUnitId': 'ameu-gpvzxxxs0n',
'inferDatasetId': 'ds-p79kyxxx7sbk',
'inferDatasetName': 'cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr'
}]
})
{
"log_id": "3617826755",
"result": [
{
"modelName": "llama2_7b_32k_z_sft",
"modelVersion": "1",
"modelVersionSource": "Train",
"evalMode": "manual",
"evaluationName": "cl_联调_模型评估_用户bos",
"id": "65eae1fb1xxx9ca97a1",
"modelVersionId": 833,
"modelId": 591,
"userId": 1,
"evaluationId": 401,
"modelForm": "model",
"modelIdStr": "am-dkxwxxxxjgw",
"modelVersionIdStr": "amv-7ab3xxxtspe1",
"evaluationIdStr": "ame-28zxxx2rn4",
"evalUnitId": "ameu-gpvzxxxs0n",
"inferDatasetId": "ds-p79kyxxx7sbk",
"inferDatasetName": "cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr",
"effectMetric": {
"accuracy": 0,
"f1Score": 0,
"rouge_1": 0,
"rouge_2": 0,
"rouge_l": 0,
"bleu4": 0,
"avgJudgeScore": 0,
"stdJudgeScore": 0,
"medianJudgeScore": 0,
"scoreDistribution": null,
"manualAvgScore": 0.5,
"goodCaseProportion": 0,
"subjectiveImpression": "1",
"manualScoreDistribution": [
{
"dimension": "满意度",
"scoreDistribution": {
"-1": 2,
"1": 1
}
},
{
"dimension": "安全性",
"scoreDistribution": {
"-1": 2,
"0": 1
}
}
]
},
"performanceMetric": {}
},
{
"modelName": "mixtral2",
"modelVersion": "8",
"modelVersionSource": "Train",
"evalMode": "manual",
"evaluationName": "cl_联调_模型评估_用户bos",
"id": "65eae45dxxxcab739",
"modelVersionId": 700,
"modelId": 545,
"userId": 1,
"evaluationId": 401,
"modelForm": "model",
"modelIdStr": "am-ktcxxx88z",
"modelVersionIdStr": "amv-g2acxxxg9v",
"evaluationIdStr": "ame-28zxxx2rn4",
"evalUnitId": "ameu-1uxpxxx8uc2",
"inferDatasetId": "ds-ba82xxxguh",
"inferDatasetName": "cl_联调_模型评估_用户bos_mixtral2_V8_x5xt",
"effectMetric": {
"accuracy": 0,
"f1Score": 0,
"rouge_1": 0,
"rouge_2": 0,
"rouge_l": 0,
"bleu4": 0,
"avgJudgeScore": 0,
"stdJudgeScore": 0,
"medianJudgeScore": 0,
"scoreDistribution": null,
"manualAvgScore": 0.5,
"goodCaseProportion": 0,
"subjectiveImpression": "2",
"manualScoreDistribution": [
{
"dimension": "满意度",
"scoreDistribution": {
"-1": 2,
"1": 1
}
},
{
"dimension": "安全性",
"scoreDistribution": {
"-1": 2,
"0": 1
}
}
]
},
"performanceMetric": {}
}
]
}
{
"log_id": "3617826755",
"result": [
{
"modelName": "llama2_7b_32k_z_sft",
"modelVersion": "1",
"modelVersionSource": "Train",
"evalMode": "manual",
"evaluationName": "cl_联调_模型评估_用户bos",
"id": "65eae1fb1xxx9ca97a1",
"modelVersionId": 833,
"modelId": 591,
"userId": 1,
"evaluationId": 401,
"modelForm": "model",
"modelIdStr": "am-dkxwxxxxjgw",
"modelVersionIdStr": "amv-7ab3xxxtspe1",
"evaluationIdStr": "ame-28zxxx2rn4",
"evalUnitId": "ameu-gpvzxxxs0n",
"inferDatasetId": "ds-p79kyxxx7sbk",
"inferDatasetName": "cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr",
"effectMetric": {
"accuracy": 0,
"f1Score": 0,
"rouge_1": 0,
"rouge_2": 0,
"rouge_l": 0,
"bleu4": 0,
"avgJudgeScore": 0,
"stdJudgeScore": 0,
"medianJudgeScore": 0,
"scoreDistribution": null,
"manualAvgScore": 0.5,
"goodCaseProportion": 0,
"subjectiveImpression": "1",
"manualScoreDistribution": [
{
"dimension": "满意度",
"scoreDistribution": {
"-1": 2,
"1": 1
}
},
{
"dimension": "安全性",
"scoreDistribution": {
"-1": 2,
"0": 1
}
}
]
},
"performanceMetric": {}
},
{
"modelName": "mixtral2",
"modelVersion": "8",
"modelVersionSource": "Train",
"evalMode": "manual",
"evaluationName": "cl_联调_模型评估_用户bos",
"id": "65eae45dxxxcab739",
"modelVersionId": 700,
"modelId": 545,
"userId": 1,
"evaluationId": 401,
"modelForm": "model",
"modelIdStr": "am-ktcxxx88z",
"modelVersionIdStr": "amv-g2acxxxg9v",
"evaluationIdStr": "ame-28zxxx2rn4",
"evalUnitId": "ameu-1uxpxxx8uc2",
"inferDatasetId": "ds-ba82xxxguh",
"inferDatasetName": "cl_联调_模型评估_用户bos_mixtral2_V8_x5xt",
"effectMetric": {
"accuracy": 0,
"f1Score": 0,
"rouge_1": 0,
"rouge_2": 0,
"rouge_l": 0,
"bleu4": 0,
"avgJudgeScore": 0,
"stdJudgeScore": 0,
"medianJudgeScore": 0,
"scoreDistribution": null,
"manualAvgScore": 0.5,
"goodCaseProportion": 0,
"subjectiveImpression": "2",
"manualScoreDistribution": [
{
"dimension": "满意度",
"scoreDistribution": {
"-1": 2,
"1": 1
}
},
{
"dimension": "安全性",
"scoreDistribution": {
"-1": 2,
"0": 1
}
}
]
},
"performanceMetric": {}
}
]
}
{
log_id: '3617826755',
result: [
{
modelName: 'llama2_7b_32k_z_sft',
modelVersion: '1',
modelVersionSource: 'Train',
evalMode: 'manual',
evaluationName: 'cl_联调_模型评估_用户bos',
id: '65eae1fb1xxx9ca97a1',
modelVersionId: 833,
modelId: 591,
userId: 1,
evaluationId: 401,
modelForm: 'model',
modelIdStr: 'am-dkxwxxxxjgw',
modelVersionIdStr: 'amv-7ab3xxxtspe1',
evaluationIdStr: 'ame-28zxxx2rn4',
evalUnitId: 'ameu-gpvzxxxs0n',
inferDatasetId: 'ds-p79kyxxx7sbk',
inferDatasetName: 'cl_联调_模型评估_用户bos_llama2_7b_32k_z_sft_V1_jmrr',
effectMetric: {
accuracy: 0,
f1Score: 0,
rouge_1: 0,
rouge_2: 0,
rouge_l: 0,
bleu4: 0,
avgJudgeScore: 0,
stdJudgeScore: 0,
medianJudgeScore: 0,
scoreDistribution: null,
manualAvgScore: 0.5,
goodCaseProportion: 0,
subjectiveImpression: '1',
manualScoreDistribution: [
{
dimension: '满意度',
scoreDistribution: {
-1: 2,
1: 1
}
},
{
dimension: '安全性',
scoreDistribution: {
-1: 2,
0: 1
}
}
]
},
performanceMetric: {}
},
{
modelName: 'mixtral2',
modelVersion: '8',
modelVersionSource: 'Train',
evalMode: 'manual',
evaluationName: 'cl_联调_模型评估_用户bos',
id: '65eae45dxxxcab739',
modelVersionId: 700,
modelId: 545,
userId: 1,
evaluationId: 401,
modelForm: 'model',
modelIdStr: 'am-ktcxxx88z',
modelVersionIdStr: 'amv-g2acxxxg9v',
evaluationIdStr: 'ame-28zxxx2rn4',
evalUnitId: 'ameu-1uxpxxx8uc2',
inferDatasetId: 'ds-ba82xxxguh',
inferDatasetName: 'cl_联调_模型评估_用户bos_mixtral2_V8_x5xt',
effectMetric: {
accuracy: 0,
f1Score: 0,
rouge_1: 0,
rouge_2: 0,
rouge_l: 0,
bleu4: 0,
avgJudgeScore: 0,
stdJudgeScore: 0,
medianJudgeScore: 0,
scoreDistribution: null,
manualAvgScore: 0.5,
goodCaseProportion: 0,
subjectiveImpression: '2',
manualScoreDistribution: [
{
dimension: '满意度',
scoreDistribution: {
-1: 2,
1: 1
}
},
{
dimension: '安全性',
scoreDistribution: {
-1: 2,
0: 1
}
}
]
},
performanceMetric: {}
}
]
}
请求参数
注意:不同语言SDK的请求参数不同,请根据实际调用选择对应参数。
- Python SDK请求参数说明
名称 | 类型 | 必填 | 描述 |
---|---|---|---|
eval_id | string | 是 | 评估任务id,示例:ame-vwgs2ybhyhfv,说明: (1)可以通过以下任一方式获取该字段值: · 方式一:通过调用创建模型评估任务接口,返回的字段evalIdStr获取 · 方式二:通过页面url地址获取,在控制台-模型评估页面,点击某评估任务名称打开详情页,在页面url地址中查看,如下图所示 (2)该字段新增支持string类型,如果之前使用的是int类型,建议变更为string类型,后续可能将逐步废弃int类型;例如之前是通过调用创建模型评估任务接口,返回的字段evalId获取,建议替换为返回的evalIdStr获取 |
- 其它SDK请求参数说明
名称 | 类型 | 必填 | 描述 |
---|---|---|---|
id | string | 是 | 评估任务id,示例:ame-vwgs2ybhyhfv,说明: (1)可以通过以下方式获取该字段值: · 方式一,通过调用创建模型评估任务接口,返回的字段evalIdStr获取 · 方式二,在控制台-模型评估页面,点击某评估任务名称打开详情页,在任务详情的基本信息中查看,如下图所示 (2)该字段新增支持string类型,如果之前使用的是int类型,建议变更为string类型,后续可能将逐步废弃int类型;例如之前是通过调用创建模型评估任务接口,返回的字段evalId获取,建议替换为返回的evalIdStr获取 |
返回参数
名称 | 类型 | 描述 |
---|---|---|
log_id | string | 请求ID |
result | object[] | 请求结果 |
result说明
名称 | 类型 | 描述 |
---|---|---|
evaluationId | int | 评估任务ID |
evaluationName | string | 评估任务名称 |
modelId | int | 模型ID |
modelVersionId | int | 模型版本ID |
modelName | string | 模型名 |
modelVersion | string | 模型版本号 |
modelVersionSource | string | 模型版本来源 |
evalMode | string | 评估模式,说明: (1)有以下评估模式: · rule:基于规则 · model:裁判员模型 · manual:人工评估 (2)使用多个模式时,以英文逗号(,)拼接,示例:“model,manual,rule” |
effectMetric | object | 效果指标 |
modelForm | string | 评估的物料类型,说明: · model:旧数据(推理结果集评估功能上线前的评估任务)类型都是模型,即值为model · inferDataset:推理结果集 |
modelIdStr | string | 模型字符串id |
modelVersionIdStr | string | 模型版本字符串id |
evaluationIdStr | string | 评估任务字符串id |
evalUnitId | string | 评估子任务id,用于唯一标识评估子任务 |
inferDatasetId | string | 当前评估子任务使用的推理结果集id |
inferDatasetName | string | 当前评估子任务使用的推理结果集名称 |
effectMetric说明
名称 | 类型 | 描述 |
---|---|---|
id | string | 单个评估报告的主键,说明:该字段位于result数组的元素中(与effectMetric同级),而非effectMetric对象内 |
accuracy | number | 基于规则-准确率打分(Accuracy) |
f1Score | number | 基于规则-准确率打分(F1分数) |
rouge_1 | number | 基于规则-相似度打分(ROUGE-1) |
rouge_2 | number | 基于规则-相似度打分(ROUGE-2) |
rouge_l | number | 基于规则-相似度打分(ROUGE-L) |
bleu4 | number | 基于规则-相似度打分(BLEU-4) |
avgJudgeScore | number | 裁判员打分-均值 |
stdJudgeScore | number | 裁判员打分-标准差 |
medianJudgeScore | number | 裁判员打分-中位数 |
scoreDistribution | map[string]int | 裁判员打分-分值分布,说明: · 含有从最小值到最大值的所有分数 · -1为无效打分 |
manualAvgScore | number | 平均分 |
goodCaseProportion | int | good case占比 |
subjectiveImpression | string | 人工打分-主观印象 |
manualScoreDistribution | object[] | 维度分数分布 |
manualScoreDistribution说明
名称 | 类型 | 描述 |
---|---|---|
dimension | string | 评价维度 |
scoreDistribution | map[string]int | 维度分值分布,key为分值,value为分值的个数 |