查看清洗任务列表

更新时间：2025-01-17

功能介绍

本接口用于查看清洗任务列表。

注意事项

通过API查看清洗任务列表，和千帆控制台页面展示字段不同：

本文API参数中的部分字段，可能未在千帆控制台页面展示
千帆控制台页面展示的部分字段，可能未包含在本文API参数中
后续会持续完善API功能，请关注API文档更新

使用说明

本文API支持通过Python SDK、Go SDK、Java SDK 和 Node.js SDK调用，调用流程请参考SDK安装及使用流程。

SDK调用

调用示例

import os
from qianfan  import resources

# Initialize the credentials through environment variables.
# Call with the security-authentication AK/SK: replace your_iam_ak with your Access Key and
# your_iam_sk with your Secret Key. To obtain them, see https://cloud.baidu.com/doc/Reference/s/9jwvz2egb
os.environ["QIANFAN_ACCESS_KEY"] = "your_iam_ak"
os.environ["QIANFAN_SECRET_KEY"] = "your_iam_sk"

# Route of this API. It is a fixed value and needs no modification; it corresponds to the
# suffix of the request address (API documentation - request structure - request address).
api_route = "/wenxinworkshop/etl/list"

# Request body. Choose the parameters you need according to the request-parameter section of
# this document; they correspond to the Body parameters (API documentation - request parameters).
request_body = {
    "offset": 0,
    "pageSize": 10,
}

resp = resources.console.utils.call_action(api_route, "", request_body)

print(resp.body)

package main
import (
    "context"
    "fmt"
    "os"
    "github.com/baidubce/bce-qianfan-sdk/go/qianfan"
)
func main() {
    // Authenticate with the security-authentication AK/SK, initialized through environment
    // variables; replace your_iam_ak with your Access Key and your_iam_sk with your Secret Key.
    os.Setenv("QIANFAN_ACCESS_KEY", "your_iam_ak")
    os.Setenv("QIANFAN_SECRET_KEY", "your_iam_sk")

    // Request body. Choose the parameters you need according to the request-parameter section
    // of this document; they correspond to the Body parameters in the API documentation.
    reqBody := map[string]interface{}{
        "offset":   0,
        "pageSize": 10,
    }

    action := qianfan.NewConsoleAction()

    // The route is a fixed value for this API and needs no modification; it corresponds to the
    // suffix of the request address (API documentation - request structure - request address).
    resp, callErr := action.Call(context.TODO(), "/wenxinworkshop/etl/list", "", reqBody)
    if callErr != nil {
        panic(callErr)
    }

    fmt.Println(string(resp.Body))
}

import com.baidubce.qianfan.Qianfan;
import com.baidubce.qianfan.model.console.ConsoleResponse;
import com.baidubce.qianfan.util.CollUtils;
import com.baidubce.qianfan.util.Json;
import java.util.Map;

/**
 * Sample program that calls the "list ETL (data cleaning) tasks" console API
 * through the Qianfan Java SDK and prints the serialized response.
 */
public class Dome {
    /**
     * Sends one request to /wenxinworkshop/etl/list with offset 0 and pageSize 10
     * and writes the serialized response to standard output.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // Authenticate with the security-authentication AK/SK: replace your_iam_ak with your
        // Access Key and your_iam_sk with your Secret Key.
        Qianfan qianfan = new Qianfan("your_iam_ak", "your_iam_sk");

        ConsoleResponse<Map<String, Object>> response = qianfan.console()
                // Route of this API. It is a fixed value and needs no modification; it corresponds to
                // the suffix of the request address (API documentation - request structure - request address).
                .route("/wenxinworkshop/etl/list")
                // When parameters must be passed, wrap them in your own request class or build the
                // request body with Map.of().
                // On Java 8, CollUtils.mapOf() provided by the SDK can be used in place of Map.of().
                // Choose the parameters you need according to the request-parameter section of this
                // document; they correspond to the Body parameters in the API documentation.
                .body(CollUtils.mapOf(
                        "offset", 0,
                        "pageSize", 10
                ))
                .execute();

        System.out.println(Json.serialize(response));
    }
}

import {consoleAction, setEnvVariable} from "@baiducloud/qianfan";

// Authenticate with the security-authentication AK/SK, initialized through environment variables;
// replace your_iam_ak with your Access Key and your_iam_sk with your Secret Key.
setEnvVariable('QIANFAN_ACCESS_KEY','your_iam_ak');
setEnvVariable('QIANFAN_SECRET_KEY','your_iam_sk');

async function main() {
  // base_api_route: route of this API. It is a fixed value and needs no modification; it corresponds
  // to the suffix of the request address (API documentation - request structure - request address).
  const base_api_route = '/wenxinworkshop/etl/list';

  // data: choose the parameters you need according to the request-parameter section of this
  // document; they correspond to the Body parameters in the API documentation.
  const data = {
    "offset": 0,
    "pageSize": 10
  };

  const res = await consoleAction({base_api_route, data});

  console.log(res);
}

main();

返回示例

{
    "log_id": "wwcm30w7exxexyqx",
    "result": {
        "processingCount": 0,
        "items": [
            {
                "etlId": 275,
                "etlStrId": "task-992515vjv503t94c",
                "startTime": "2023-11-06 16:03:23",
                "sourceDatasetName": "4train_generic_usrBos-V1",
                "destDatasetName": "4train_generic_sysBos-V1",
                "operatorNameList": [
                    "remove_invisible_character",
                    "replace_uniform_whitespace",
                    "remove_non_meaning_characters",
                    "replace_traditional_chinese_to_simplified",
                    "remove_web_identifiers",
                    "remove_emoji",
                    "deduplication_simhash",
                    "replace_emails",
                    "replace_ip",
                    "replace_identifier",
                    "filter_check_number_words",
                    "filter_check_character_repetition_removal",
                    "filter_check_word_repetition_removal",
                    "filter_check_special_characters",
                    "filter_check_flagged_words",
                    "filter_check_lang_id",
                    "filter_check_perplexity"
                ],
                "sourceDatasetId": 2235,
                "sourceDatasetStrId": "ds-xxafmaifn213d",
                "destDatasetId": 2230,
                "destDatasetStrId": "ds-1j3l12jddok12",
                "entityCount": 1,
                "entityType": 2,
                "result": {
                    "RET_OK": 0,
                    "pipeline_stage_result": null,
                    "export_entity_num": 0,
                    "remaining_entity": 0,
                    "unprocessed_entity": 0,
                    "remove_emoji": {
                        "processed_entity": 0
                    },
                    "remove_url": {
                        "processed_entity": 0
                    },
                    "trad_to_simp": {
                        "processed_entity": 0
                    },
                    "remove_id_card": {
                        "processed_entity": 0
                    },
                    "remove_phone_number": {
                        "processed_entity": 0
                    },
                    "remove_exception_char": {
                        "processed_entity": 0
                    },
                    "replace_sim2trad": {
                        "processed_entity": 0
                    },
                    "replace_trad2sim": {
                        "processed_entity": 0
                    },
                    "replace_upper2lower": {
                        "processed_entity": 0
                    },
                    "cut": {
                        "remaining_entity": 0,
                        "unprocessed_entity": 0
                    },
                    "failReason": "",
                    "pauseReason": ""
                },
                "processStatus": 4,
                "status": 0,
                "errCode": 0,
                "errMsg": "",
                "createTime": "0001-01-01T00:00:00Z",
                "finishTime": "0001-01-01T00:00:00Z",
                "modifyTime": "0001-01-01T00:00:00Z"
            }
        ],
        "total": 1
    },
    "status": 200,
    "success": true
}

{
    "log_id": "wwcm30w7exxexyqx",
    "result": {
        "processingCount": 0,
        "items": [
            {
                "etlId": 275,
                "etlStrId": "task-992515vjv503t94c",
                "startTime": "2023-11-06 16:03:23",
                "sourceDatasetName": "4train_generic_usrBos-V1",
                "destDatasetName": "4train_generic_sysBos-V1",
                "operatorNameList": [
                    "remove_invisible_character",
                    "replace_uniform_whitespace",
                    "remove_non_meaning_characters",
                    "replace_traditional_chinese_to_simplified",
                    "remove_web_identifiers",
                    "remove_emoji",
                    "deduplication_simhash",
                    "replace_emails",
                    "replace_ip",
                    "replace_identifier",
                    "filter_check_number_words",
                    "filter_check_character_repetition_removal",
                    "filter_check_word_repetition_removal",
                    "filter_check_special_characters",
                    "filter_check_flagged_words",
                    "filter_check_lang_id",
                    "filter_check_perplexity"
                ],
                "sourceDatasetId": 2235,
                "sourceDatasetStrId": "ds-xxafmaifn213d",
                "destDatasetId": 2230,
                "destDatasetStrId": "ds-1j3l12jddok12",
                "entityCount": 1,
                "entityType": 2,
                "result": {
                    "RET_OK": 0,
                    "pipeline_stage_result": null,
                    "export_entity_num": 0,
                    "remaining_entity": 0,
                    "unprocessed_entity": 0,
                    "remove_emoji": {
                        "processed_entity": 0
                    },
                    "remove_url": {
                        "processed_entity": 0
                    },
                    "trad_to_simp": {
                        "processed_entity": 0
                    },
                    "remove_id_card": {
                        "processed_entity": 0
                    },
                    "remove_phone_number": {
                        "processed_entity": 0
                    },
                    "remove_exception_char": {
                        "processed_entity": 0
                    },
                    "replace_sim2trad": {
                        "processed_entity": 0
                    },
                    "replace_trad2sim": {
                        "processed_entity": 0
                    },
                    "replace_upper2lower": {
                        "processed_entity": 0
                    },
                    "cut": {
                        "remaining_entity": 0,
                        "unprocessed_entity": 0
                    },
                    "failReason": "",
                    "pauseReason": ""
                },
                "processStatus": 4,
                "status": 0,
                "errCode": 0,
                "errMsg": "",
                "createTime": "0001-01-01T00:00:00Z",
                "finishTime": "0001-01-01T00:00:00Z",
                "modifyTime": "0001-01-01T00:00:00Z"
            }
        ],
        "total": 1
    },
    "status": 200,
    "success": true
}

{
    "log_id": "wwcm30w7exxexyqx",
    "result": {
        "processingCount": 0,
        "items": [
            {
                "etlId": 275,
                "etlStrId": "task-992515vjv503t94c",
                "startTime": "2023-11-06 16:03:23",
                "sourceDatasetName": "4train_generic_usrBos-V1",
                "destDatasetName": "4train_generic_sysBos-V1",
                "operatorNameList": [
                    "remove_invisible_character",
                    "replace_uniform_whitespace",
                    "remove_non_meaning_characters",
                    "replace_traditional_chinese_to_simplified",
                    "remove_web_identifiers",
                    "remove_emoji",
                    "deduplication_simhash",
                    "replace_emails",
                    "replace_ip",
                    "replace_identifier",
                    "filter_check_number_words",
                    "filter_check_character_repetition_removal",
                    "filter_check_word_repetition_removal",
                    "filter_check_special_characters",
                    "filter_check_flagged_words",
                    "filter_check_lang_id",
                    "filter_check_perplexity"
                ],
                "sourceDatasetId": 2235,
                "sourceDatasetStrId": "ds-xxafmaifn213d",
                "destDatasetId": 2230,
                "destDatasetStrId": "ds-1j3l12jddok12",
                "entityCount": 1,
                "entityType": 2,
                "result": {
                    "RET_OK": 0,
                    "pipeline_stage_result": null,
                    "export_entity_num": 0,
                    "remaining_entity": 0,
                    "unprocessed_entity": 0,
                    "remove_emoji": {
                        "processed_entity": 0
                    },
                    "remove_url": {
                        "processed_entity": 0
                    },
                    "trad_to_simp": {
                        "processed_entity": 0
                    },
                    "remove_id_card": {
                        "processed_entity": 0
                    },
                    "remove_phone_number": {
                        "processed_entity": 0
                    },
                    "remove_exception_char": {
                        "processed_entity": 0
                    },
                    "replace_sim2trad": {
                        "processed_entity": 0
                    },
                    "replace_trad2sim": {
                        "processed_entity": 0
                    },
                    "replace_upper2lower": {
                        "processed_entity": 0
                    },
                    "cut": {
                        "remaining_entity": 0,
                        "unprocessed_entity": 0
                    },
                    "failReason": "",
                    "pauseReason": ""
                },
                "processStatus": 4,
                "status": 0,
                "errCode": 0,
                "errMsg": "",
                "createTime": "0001-01-01T00:00:00Z",
                "finishTime": "0001-01-01T00:00:00Z",
                "modifyTime": "0001-01-01T00:00:00Z"
            }
        ],
        "total": 1
    },
    "status": 200,
    "success": true
}

{
    log_id: 'wwcm30w7exxexyqx',
    result: {
        processingCount: 0,
        items: [
            {
                etlId: 275,
                etlStrId: 'task-992515vjv503t94c',
                startTime: '2023-11-06 16:03:23',
                sourceDatasetName: '4train_generic_usrBos-V1',
                destDatasetName: '4train_generic_sysBos-V1',
                operatorNameList: [
                    "remove_invisible_character",
                    "replace_uniform_whitespace",
                    "remove_non_meaning_characters",
                    "replace_traditional_chinese_to_simplified",
                    "remove_web_identifiers",
                    "remove_emoji",
                    "deduplication_simhash",
                    "replace_emails",
                    "replace_ip",
                    "replace_identifier",
                    "filter_check_number_words",
                    "filter_check_character_repetition_removal",
                    "filter_check_word_repetition_removal",
                    "filter_check_special_characters",
                    "filter_check_flagged_words",
                    "filter_check_lang_id",
                    "filter_check_perplexity"
                ],
                sourceDatasetId: 2235,
                sourceDatasetStrId: 'ds-xxafmaifn213d',
                destDatasetId: 2230,
                destDatasetStrId: 'ds-1j3l12jddok12',
                entityCount: 1,
                entityType: 2,
                result: {
                    RET_OK: 0,
                    pipeline_stage_result: null,
                    export_entity_num: 0,
                    remaining_entity: 0,
                    unprocessed_entity: 0,
                    remove_emoji: {
                        processed_entity: 0
                    },
                    remove_url: {
                        processed_entity: 0
                    },
                    trad_to_simp: {
                        processed_entity: 0
                    },
                    remove_id_card: {
                        processed_entity: 0
                    },
                    remove_phone_number: {
                        processed_entity: 0
                    },
                    remove_exception_char: {
                        processed_entity: 0
                    },
                    replace_sim2trad: {
                        processed_entity: 0
                    },
                    replace_trad2sim: {
                        processed_entity: 0
                    },
                    replace_upper2lower: {
                        processed_entity: 0
                    },
                    cut: {
                        remaining_entity: 0,
                        unprocessed_entity: 0
                    },
                    failReason: '',
                    pauseReason: ''
                },
                processStatus: 4,
                status: 0,
                errCode: 0,
                errMsg: '',
                createTime: '0001-01-01T00:00:00Z',
                finishTime: '0001-01-01T00:00:00Z',
                modifyTime: '0001-01-01T00:00:00Z'
            }
        ],
        total: 1
    },
    status: 200,
    success: true
}

请求参数

名称	类型	必填	描述
offset	int	否	分页偏移，默认0
pageSize	int	是	分页大小，范围[1,20]

返回参数

说明：返回的部分字段如下，未说明的字段暂无需关注。

名称	类型	说明
log_id	string	操作记录id
result	object	返回结果
status	int	状态码
success	bool	是否操作成功 · true：成功 · false：失败

返回结果result说明

名称	类型	说明
processingCount	int	正在清洗的任务数
items	object[]	任务列表
total	int	任务总数

item说明

名称	类型	说明
etlId	int	清洗任务序号，注意：该字段后续将废弃，如果使用了此字段，建议变更为etlStrId字段
etlStrId	string	清洗任务序号
startTime	string	开始时间
sourceDatasetName	string	清洗前源数据集名称
destDatasetName	string	清洗后目标数据集名称
operatorNameList	string[]	清洗使用的算子，（1）Clean清洗阶段算子 · remove_emoji：去除文档中的表情 · remove_invisible_character：移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围 · replace_uniform_whitespace：将不同的unicode空格比如 \u2008，转成正常的空格 · remove_non_meaning_characters：去除乱码和无意义的unicode · replace_traditional_chinese_to_simplified：繁体转简体，如“不經意，妳的笑容”清洗成“不经意，你的笑容” · remove_web_identifiers：移除文档中的html标签，如`<html>,<div>,<p>`等（2）Filter过滤阶段算子 · filter_check_number_words：检查文档的词数目，词数目不在指定范围会被过滤掉，如中文[1,10000] · filter_check_word_repetition_removal：检查文档的词重复率，如果词重复率太高，意味着文档中重复的词太多，文档会被过滤掉 · filter_check_character_repetition_removal：检查文档的字重复率，如果字重复率太高，意味着文档中重复的字太多，文档会被过滤掉 · filter_check_special_characters：检查文档的特殊字符率，如果特殊字符率太高，意味着文档中特殊字符太多，文档会被过滤掉 · filter_check_flagged_words：检查文档的色情暴力词率,如果色情暴力词率太高，文档会被过滤掉 · filter_check_lang_id：检查文档的语言概率,如果语言概率太低，文档会被过滤掉 · filter_check_perplexity：检查文档的困惑度,如果困惑度太高，文档会被过滤掉（3）Deduplication去重阶段算子 · deduplication_simhash：根据海明距离计算文档相似度, 相似度<=海明距离，认为两个文档相似。（4）Desensitization 去隐私阶段算子 · replace_emails：去除email地址 · replace_ip：去除IPv4 或者 IPv6 地址 · replace_identifier：去除数字和字母数字标识符，如电话号码、信用卡号、十六进制散列等，同时跳过年份和简单数字的实例
sourceDatasetId	int	源数据集版本id，注意：此字段后续将废弃，如果使用了此字段，建议变更为sourceDatasetStrId字段
sourceDatasetStrId	string	源数据集版本字符串id
destDatasetId	int	目标数据集版本id，注意：此字段后续将废弃，如果使用了此字段，建议变更为destDatasetStrId字段
destDatasetStrId	string	目标数据集版本字符串id
entityCount	int	样本数量
entityType	int	样本类型，说明： · 1：图片 · 2：文本
result	object	清洗结果
processStatus	int	清洗状态信息，说明： · 0：无状态，表示没有任务 · 1：运行中 · 2：已完成 · 3：任务终止 · 4：任务失败 · 5：任务暂停
status	int	状态： · 0：正常 · 1：删除
errCode	int	未启用，是否清洗错误都为0
errMsg	string	清洗错误时返回"failed"
createTime	string	创建时间
finishTime	string	完成时间
modifyTime	string	更改时间

清洗结果result说明

名称	类型	描述
RET_OK	int	清洗结果
pipeline_stage_result	object	pipeline状态结果
export_entity_num	int	导出样本数量
remaining_entity	int	剩余样本
unprocessed_entity	int	尚未清洗样本
remove_emoji	object	只有一个int字段，processed_entity：某个算子被执行的行数
remove_url	object	只有一个int字段，processed_entity：某个算子被执行的行数
trad_to_simp	object	只有一个int字段，processed_entity：某个算子被执行的行数
remove_id_card	object	只有一个int字段，processed_entity：某个算子被执行的行数
remove_phone_number	object	只有一个int字段，processed_entity：某个算子被执行的行数
remove_exception_char	object	只有一个int字段，processed_entity：某个算子被执行的行数
replace_sim2trad	object	只有一个int字段，processed_entity：某个算子被执行的行数
replace_trad2sim	object	只有一个int字段，processed_entity：某个算子被执行的行数
replace_upper2lower	object	只有一个int字段，processed_entity：某个算子被执行的行数
cut	object	裁剪，说明： · remaining_entity：剩余样本数量 · unprocessed_entity：尚未清洗样本
failReason	string	失败原因
pauseReason	string	暂停原因

pipeline_stage_result说明

名称	类型	描述
clean	object	数据清洗clean阶段执行结果
deduplication	object	数据清洗deduplication阶段执行结果
desensitization	object	数据清洗desensitization阶段执行结果
filter	object	数据清洗filter阶段执行结果

执行结果说明

clean、deduplication、desensitization、filter 阶段执行结果字段相同，如下

名称	类型	描述
status	string	数据清洗某阶段执行结果，例："Success"
operator_count	int	该阶段算子数
entity_match_count	int	匹配到的样本数量
each_operator_result	object[]	具体到算子的清洗结果列表

each_operator_result 说明

名称	类型	描述
name	string	算子名称
remaining_count	int	通过该算子清洗后剩余样本数
drop_count	int	通过该算子清洗掉的样本数

删除数据清洗任务

数据增强V1

百度智能云

千帆大模型服务与开发平台ModelBuilder