创建数据清洗任务

更新时间：2025-04-16

功能介绍

用于创建数据清洗任务。

使用说明

本文API支持通过Python SDK、Go SDK、Java SDK 和 Node.js SDK调用，调用流程请参考SDK安装及使用流程。

权限说明

调用本文API，需符合以下权限要求，权限介绍及分配，请查看角色与权限控制列表、账号创建与权限分配。需具有以下任一权限：

完全控制千帆大模型平台的权限：QianfanFullControlAccessPolicy
完全控制千帆大模型平台数据管理（除数据标注外）的权限：QianfanDataFullControlAccessPolicy

SDK调用

在千帆平台创建一个数据清洗任务，需要提供清洗任务名称 etlTaskName、源数据集版本 ID sourceDatasetId、目标数据集版本 ID destDatasetId、清洗样本类型 entityType 和数据清洗使用的算子参数字典 operationsV2。

调用示例

import os
from qianfan  import resources

# Initialise the credentials through environment variables.
# IAM AK/SK authentication is used: replace your_iam_ak with your Access Key and
# your_iam_sk with your Secret Key. How to obtain them:
# https://cloud.baidu.com/doc/Reference/s/9jwvz2egb
os.environ["QIANFAN_ACCESS_KEY"] = "your_iam_ak"
os.environ["QIANFAN_SECRET_KEY"] = "your_iam_sk"

# Operators chosen for each stage; see the request-parameter section of this
# page for the available operator names and their args.
operations_v2 = {
    "clean": [
        {"name": "remove_invisible_character", "args": {}},
        {"name": "replace_uniform_whitespace", "args": {}},
        {"name": "remove_non_meaning_characters", "args": {}},
        {"name": "replace_traditional_chinese_to_simplified", "args": {}},
        {"name": "remove_web_identifiers", "args": {}},
        {"name": "remove_emoji", "args": {}},
    ],
    "deduplication": [
        {"name": "deduplication_simhash", "args": {"distance": 4}},
    ],
    "desensitization": [
        {"name": "replace_emails", "args": {}},
        {"name": "replace_ip", "args": {}},
        {"name": "replace_identifier", "args": {}},
    ],
    "filter": [
        {
            "name": "filter_check_number_words",
            "args": {
                "number_words_min_cutoff": 1,
                "number_words_max_cutoff": 10000,
            },
        },
        {
            "name": "filter_check_word_repetition_removal",
            "args": {"word_repetition_max_cutoff": 0.96},
        },
        {
            "name": "filter_check_special_characters",
            "args": {"special_characters_max_cutoff": 0.3},
        },
        {
            "name": "filter_check_flagged_words",
            "args": {"flagged_words_max_cutoff": 0.3},
        },
    ],
}

# Request body; corresponds to "Body parameters" in the API documentation.
request_body = {
    "etlTaskName": "清洗任务1",
    "sourceDatasetId": "ds-9tff1q3h7ngdmgh4",
    "destDatasetId": "ds-3tfe1q3g7ncdmsh2",
    "operationsV2": operations_v2,
    "entityType": 2,
}

# The route is a fixed value for this API (the suffix of the request address
# in the API documentation) and does not need to be changed.
resp = resources.console.utils.call_action(
    "/wenxinworkshop/etl/create", "", request_body
)

print(resp.body)

package main
import (
    "context"
    "fmt"
    "os"
    "github.com/baidubce/bce-qianfan-sdk/go/qianfan"
)
// main creates a data cleaning task through the /wenxinworkshop/etl/create
// console API and prints the raw response body. It panics if the call
// returns an error.
func main() {
	// Authenticate with IAM AK/SK through environment variables; replace
	// your_iam_ak with your Access Key and your_iam_sk with your Secret Key.
	os.Setenv("QIANFAN_ACCESS_KEY", "your_iam_ak")
	os.Setenv("QIANFAN_SECRET_KEY", "your_iam_sk")

	ca := qianfan.NewConsoleAction()

	res, err := ca.Call(context.TODO(),
		// Fixed value for this API (the suffix of the request address in the
		// API documentation); no need to change it.
		"/wenxinworkshop/etl/create", "",
		// Request body; pick the parameters you need from the
		// request-parameter section of this page.
		map[string]any{
			"etlTaskName":     "清洗任务1",
			"sourceDatasetId": "ds-9tff1q3h7ngdmgh4",
			"destDatasetId":   "ds-3tfe1q3g7ncdmsh2",
			"operationsV2": map[string]any{
				"clean": []map[string]any{{
					"name": "remove_invisible_character",
					"args": map[string]any{},
				}, {
					"name": "replace_uniform_whitespace",
					"args": map[string]any{},
				}, {
					"name": "remove_non_meaning_characters",
					"args": map[string]any{},
				}, {
					"name": "replace_traditional_chinese_to_simplified",
					"args": map[string]any{},
				}, {
					"name": "remove_web_identifiers",
					"args": map[string]any{},
				}, {
					"name": "remove_emoji",
					"args": map[string]any{},
				}},
				"deduplication": []map[string]any{{
					"name": "deduplication_simhash",
					"args": map[string]any{
						"distance": 4,
					},
				}},
				"desensitization": []map[string]any{{
					"name": "replace_emails",
					"args": map[string]any{},
				}, {
					"name": "replace_ip",
					"args": map[string]any{},
				}, {
					"name": "replace_identifier",
					"args": map[string]any{},
				}},
				"filter": []map[string]any{{
					"name": "filter_check_number_words",
					"args": map[string]any{
						"number_words_min_cutoff": 1,
						"number_words_max_cutoff": 10000,
					},
				}, {
					"name": "filter_check_word_repetition_removal",
					"args": map[string]any{
						"word_repetition_max_cutoff": 0.96,
					},
				}, {
					"name": "filter_check_special_characters",
					"args": map[string]any{
						"special_characters_max_cutoff": 0.3,
					},
				}, {
					"name": "filter_check_flagged_words",
					"args": map[string]any{
						"flagged_words_max_cutoff": 0.3,
					},
				}},
			},
			"entityType": 2,
		})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(res.Body))
}

import com.baidubce.qianfan.Qianfan;
import com.baidubce.qianfan.model.console.ConsoleResponse;
import com.baidubce.qianfan.util.CollUtils;
import com.baidubce.qianfan.util.Json;
import java.util.Map;

/**
 * Sample that creates a data cleaning task through the
 * /wenxinworkshop/etl/create console API and prints the serialized response.
 */
public class Dome {
    public static void main(String[] args) {
        // Authenticate with IAM AK/SK; replace your_iam_ak with your Access Key
        // and your_iam_sk with your Secret Key.
        Qianfan qianfan = new Qianfan("your_iam_ak", "your_iam_sk");

        ConsoleResponse<Map<String, Object>> response = qianfan.console()
                // Fixed value for this API (the suffix of the request address
                // in the API documentation); no need to change it.
                .route("/wenxinworkshop/etl/create")
                // When parameters are required you can wrap them in your own
                // request class or build the body with Map.of(); on Java 8 use
                // the SDK's CollUtils.mapOf() in place of Map.of().
                // Pick the parameters you need from the request-parameter
                // section of this page.
                .body(CollUtils.mapOf(
                        "etlTaskName", "清洗任务1",
                        "sourceDatasetId", "ds-9tff1q3h7ngdmgh4",
                        "destDatasetId", "ds-3tfe1q3g7ncdmsh2",
                        "operationsV2", CollUtils.mapOf(
                                "clean", new Map[]{
                                        CollUtils.mapOf(
                                                "name", "remove_invisible_character",
                                                "args", CollUtils.mapOf()
                                        ),
                                        CollUtils.mapOf(
                                                "name", "replace_uniform_whitespace",
                                                "args", CollUtils.mapOf()
                                        ),
                                        CollUtils.mapOf(
                                                "name", "remove_non_meaning_characters",
                                                "args", CollUtils.mapOf()
                                        ),
                                        CollUtils.mapOf(
                                                "name", "replace_traditional_chinese_to_simplified",
                                                "args", CollUtils.mapOf()
                                        ),
                                        CollUtils.mapOf(
                                                "name", "remove_web_identifiers",
                                                "args", CollUtils.mapOf()
                                        ),
                                        CollUtils.mapOf(
                                                "name", "remove_emoji",
                                                "args", CollUtils.mapOf()
                                        )
                                },
                                "deduplication", new Map[]{
                                        CollUtils.mapOf(
                                                "name", "deduplication_simhash",
                                                "args", CollUtils.mapOf(
                                                        "distance", 4
                                                )
                                        )
                                },
                                "desensitization", new Map[]{
                                        CollUtils.mapOf(
                                                "name", "replace_emails",
                                                "args", CollUtils.mapOf()
                                        ),
                                        CollUtils.mapOf(
                                                "name", "replace_ip",
                                                "args", CollUtils.mapOf()
                                        ),
                                        CollUtils.mapOf(
                                                "name", "replace_identifier",
                                                "args", CollUtils.mapOf()
                                        )
                                },
                                "filter", new Map[]{
                                        CollUtils.mapOf(
                                                "name", "filter_check_number_words",
                                                "args", CollUtils.mapOf(
                                                        "number_words_min_cutoff", 1,
                                                        "number_words_max_cutoff", 10000
                                                )
                                        ),
                                        CollUtils.mapOf(
                                                "name", "filter_check_word_repetition_removal",
                                                "args", CollUtils.mapOf(
                                                        "word_repetition_max_cutoff", 0.96
                                                )
                                        ),
                                        CollUtils.mapOf(
                                                "name", "filter_check_special_characters",
                                                "args", CollUtils.mapOf(
                                                        "special_characters_max_cutoff", 0.3
                                                )
                                        ),
                                        CollUtils.mapOf(
                                                "name", "filter_check_flagged_words",
                                                "args", CollUtils.mapOf(
                                                        "flagged_words_max_cutoff", 0.3
                                                )
                                        )
                                }
                        ),
                        "entityType", 2
                ))
                .execute();

        System.out.println(Json.serialize(response));
    }
}

import {consoleAction, setEnvVariable} from "@baiducloud/qianfan";

// Authenticate with IAM AK/SK through environment variables; replace
// your_iam_ak with your Access Key and your_iam_sk with your Secret Key.
setEnvVariable('QIANFAN_ACCESS_KEY','your_iam_ak');
setEnvVariable('QIANFAN_SECRET_KEY','your_iam_sk');

/**
 * Creates a data cleaning task through the /wenxinworkshop/etl/create
 * console API and logs the response.
 */
async function main() {
  // Operators chosen for each stage; see the request-parameter section of
  // this page for the available operator names and their args.
  const operationsV2 = {
    "clean": [
      {"name": "remove_invisible_character", "args": {}},
      {"name": "replace_uniform_whitespace", "args": {}},
      {"name": "remove_non_meaning_characters", "args": {}},
      {"name": "replace_traditional_chinese_to_simplified", "args": {}},
      {"name": "remove_web_identifiers", "args": {}},
      {"name": "remove_emoji", "args": {}}
    ],
    "deduplication": [
      {"name": "deduplication_simhash", "args": {"distance": 4}}
    ],
    "desensitization": [
      {"name": "replace_emails", "args": {}},
      {"name": "replace_ip", "args": {}},
      {"name": "replace_identifier", "args": {}}
    ],
    "filter": [
      {
        "name": "filter_check_number_words",
        "args": {
          "number_words_min_cutoff": 1,
          "number_words_max_cutoff": 10000
        }
      },
      {
        "name": "filter_check_word_repetition_removal",
        "args": {"word_repetition_max_cutoff": 0.96}
      },
      {
        "name": "filter_check_special_characters",
        "args": {"special_characters_max_cutoff": 0.3}
      },
      {
        "name": "filter_check_flagged_words",
        "args": {"flagged_words_max_cutoff": 0.3}
      }
    ]
  };

  // Request body; corresponds to "Body parameters" in the API documentation.
  const data = {
    "etlTaskName": "清洗任务1",
    "sourceDatasetId": "ds-9tff1q3h7ngdmgh4",
    "destDatasetId": "ds-3tfe1q3g7ncdmsh2",
    "operationsV2": operationsV2,
    "entityType": 2
  };

  // base_api_route is a fixed value for this API (the suffix of the request
  // address in the API documentation); no need to change it.
  const response = await consoleAction({
    base_api_route: '/wenxinworkshop/etl/create',
    data: data
  });

  console.log(response);
}

main();

返回示例

{
	"log_id": "i9vswaefzbqpu92d",
	"result": "task-wtff1q3h7nfd3g54",
	"status": 200,
	"success": True
}

{
	"log_id": "i9vswaefzbqpu92d",
	"result": "task-wtff1q3h7nfd3g54",
	"status": 200,
	"success": true
}

{
	"log_id": "i9vswaefzbqpu92d",
	"result": "task-wtff1q3h7nfd3g54",
	"status": 200,
	"success": true
}

{
	log_id: 'i9vswaefzbqpu92d',
	result: 'task-wtff1q3h7nfd3g54',
	status: 200,
	success: true
}

请求参数

名称	必填	类型	描述
etlTaskName	是	string	清洗任务名称
sourceDatasetId	是	string	清洗前的源数据集版本ID，说明：该字段新增支持string类型；如果之前接入时使用的int类型，建议变更为string类型，后续将逐渐废弃int类型
destDatasetId	是	string	清洗后的目标数据集版本ID，说明：该字段新增支持string类型；如果之前接入时使用的int类型，建议变更为string类型，后续将逐渐废弃int类型
entityType	是	int	清洗样本类型，固定值为2，表示文本
operationsV2	是	map[string][]operationV2	清洗配置，说明：（1）key为string，需包括以下值： · 清洗：clean · 过滤：filter · 去重：deduplication · 去隐私：desensitization （2）value为list，值为单个阶段用户所选择的所有算子组成的列表 · 列表中的每个元素，对应某个算子的配置，格式参考operationV2说明 · 如果用户没有在对应阶段选择任何算子，则value为空列表

operationV2说明

名称	类型	描述
name	string	算子名称，见各阶段name值和args值（1）Clean清洗阶段算子 · remove_emoji：去除文档中的表情等 · remove_invisible_character：移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围 · replace_uniform_whitespace：将不同的unicode空格比如 \u2008，转成正常的空格 · remove_non_meaning_characters：去除乱码和无意义的unicode · replace_traditional_chinese_to_simplified：繁体转简体，如“不經意，妳的笑容”清洗成“不经意，你的笑容” · remove_web_identifiers：移除文档中的html标签，如`<html>,<div>,<p>`等（2）Filter过滤阶段算子 · filter_check_number_words：检查文档的词数目，词数目不在指定范围会被过滤掉，如中文[1,10000] · filter_check_word_repetition_removal：检查文档的词重复率，如果词重复率太高，意味着文档中重复的词太多，文档会被过滤掉 · filter_check_character_repetition_removal：检查文档的字重复率，如果字重复率太高，意味着文档中重复的字太多，文档会被过滤掉 · filter_check_special_characters：检查文档的特殊字符率，如果特殊字符率太高，意味着文档中特殊字符太多，文档会被过滤掉 · filter_check_flagged_words：检查文档的色情暴力词率,如果色情暴力词率太高，文档会被过滤掉 · filter_check_lang_id：检查文档的语言概率,如果语言概率太低，文档会被过滤掉 · filter_check_perplexity：检查文档的困惑度,如果困惑度太高，文档会被过滤掉（3）Deduplication去重阶段算子 · deduplication_simhash：根据海明距离计算文档相似度, 相似度<=海明距离，认为两个文档相似。（4）Desensitization 去隐私阶段算子 · replace_emails：去除email地址 · replace_ip：去除IPv4 或者 IPv6 地址 · replace_identifier：去除数字和字母数字标识符，如电话号码、信用卡号、十六进制散列等，同时跳过年份和简单数字的实例
args	object	算子参数，格式随算子名称而变化，见各阶段name值对应的args说明： · 当name为Clean清洗阶段算子，args值为空 · 当name为Desensitization 去隐私阶段算子，args值为空 · 当name为Deduplication去重阶段算子或Filter过滤阶段算子，请查看args说明

args说明

当name为Clean清洗阶段算子，args值为空
当name为Desensitization 去隐私阶段算子，args值为空
当name为Deduplication去重阶段算子，args说明如下

名称	类型	描述
distance	int	范围4-6

当name为Filter过滤阶段算子，args说明如下

名称	类型	描述
number_words_min_cutoff	float	最小词数目 · 范围为[1,10000] · 当name=filter_check_number_words，该字段必填
number_words_max_cutoff	float	最大词数目 · 范围为[1,10000] · 当name=filter_check_number_words，该字段必填
word_repetition_max_cutoff	float	文档的词重复率 · 范围为0-1 · 当name=filter_check_word_repetition_removal，该字段必填
default_character_repetition_max_cutoff	float	文档的字重复率 · 范围为0-1 · 当name=filter_check_character_repetition_removal，该字段必填
special_characters_max_cutoff	float	检查文档的特殊字符率，如果特殊字符率太高，意味着文档中特殊字符太多，文档会被过滤掉 · 范围为0-1 · 当name=filter_check_special_characters，该字段必填
flagged_words_max_cutoff	float	检查文档的色情暴力词率,如果色情暴力词率太高，文档会被过滤掉 ·范围为0-1 · 当name=filter_check_flagged_words，该字段必填
lang_id_min_cutoff	float	检查文档的语言概率,如果语言概率太低，文档会被过滤掉 · 范围为0-1 · 当name=filter_check_lang_id，该字段必填
perplexity_max_cutoff	float	检查文档的困惑度,如果困惑度太高，文档会被过滤掉 ·范围为1-5000 · 当name=filter_check_perplexity，该字段必填

返回参数

名称	类型	描述
log_id	string	操作记录id
result	string	清洗任务序号
status	int	状态码
success	bool	是否操作成功，说明： · true：成功 · false：失败

数据集管理

查看数据清洗任务详情

百度智能云

千帆大模型服务与开发平台ModelBuilder