Advanced Usage
Accelerating Long-Text Pretraining with a Hybrid Sequence-Parallel Strategy
Parameter configuration: the sequence-parallel arguments are defined in the aiak_training_llm/train/arguments.py file. Among them:
- --context-parallel-size: sets the sequence-parallel degree; this total degree covers both hybrid sequence-parallel strategies, Ulysses and ring attention.
- --context-parallel-ulysses-degree: on top of context-parallel-size, sets the Ulysses parallel degree. The default is 1, i.e., only Megatron's native context-parallel algorithm is used. If the two values are equal, only the Ulysses strategy is used (see the sketch below).
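A minimal sketch of common flag combinations, assuming the ring-attention degree is derived as context-parallel-size divided by the Ulysses degree (consistent with the defaults described above); the values and the CONTEXT_PARALLEL_ARGS array name are illustrative only, and in the example scripts these flags live inside MODEL_PARALLEL_ARGS:
# Pure ring attention (Megatron-native context parallel):
#   --context-parallel-size 4  --context-parallel-ulysses-degree 1
# Pure Ulysses (both values equal):
#   --context-parallel-size 4  --context-parallel-ulysses-degree 4
# Hybrid: Ulysses degree 2, assumed ring-attention degree 4 / 2 = 2:
CONTEXT_PARALLEL_ARGS=(
    --context-parallel-size 4
    --context-parallel-ulysses-degree 2
)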
**LLaMA2 70B** long-sequence training example
#! /bin/bash
MEGATRON_PATH=${MEGATRON_PATH:-"/workspace/AIAK-Megatron"}
AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH:-"/workspace/AIAK-Training-LLM"}
DATA_PATH=${DATA_PATH:-"/mnt/cluster/llama2/pile/pile-llama_text_document"}
TOKENIZER_PATH=${TOKENIZER_PATH:-"/mnt/cluster/llama2/Llama-2-70b-hf/"}
CHECKPOINT_PATH=${CHECKPOINT_PATH:-"/mnt/cluster/llama2/mcore_llama2_70b_tp4_pp4"}
TENSORBOARD_PATH=${TENSORBOARD_PATH:-"/mnt/cluster/aiak-training-llm/tensorboard-log/llama2-70b-tp4pp4"}
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=${MASTER_ADDR:-"localhost"}
MASTER_PORT=${MASTER_PORT:-"6000"}
NNODES=${WORLD_SIZE:-"1"}
NODE_RANK=${RANK:-"0"}
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
# you can set up llama2-70b manually
#MODEL_ARGS=(
# --model-name llama2
# --num-layers 80
# --hidden-size 8192
# --ffn-hidden-size 28672
# --num-attention-heads 64
# --position-embedding-type rope
# --normalization RMSNorm
# --swiglu
# --attention-dropout 0
# --hidden-dropout 0
# --disable-bias-linear
# --untie-embeddings-and-output-weights
# --group-query-attention
# --num-query-groups 8
#)
# or you can set up llama2-70b with the predefined model name below
MODEL_ARGS=(
--model-name llama2-70b # options: llama2-7b, llama2-13b, llama2-70b
)
DATA_ARGS=(
--tokenizer-type HFTokenizer
--hf-tokenizer-path $TOKENIZER_PATH
--data-path $DATA_PATH
--split 949,50,1
)
TRAINING_ARGS=(
--training-phase pretrain # options: pretrain, sft
--seq-length 32768
--max-position-embeddings 32768
--init-method-std 0.01
--micro-batch-size 1
--global-batch-size 1024
--lr 0.0002
--min-lr 1.0e-5
--clip-grad 1.0
--weight-decay 0.01
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-05
--norm-epsilon 1e-6
--train-iters 500000
--lr-decay-iters 500000
--lr-decay-style cosine
--lr-warmup-fraction 0.002
--initial-loss-scale 65536
--fp16
--load $CHECKPOINT_PATH
--save $CHECKPOINT_PATH
--save-interval 5000
--eval-interval 1000
--eval-iters 10
#--ckpt-step 0
#--no-load-optim
#--no-load-rng
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 4
--context-parallel-size 4
--context-parallel-ulysses-degree 2
--use-distributed-optimizer
--overlap-grad-reduce
--overlap-param-gather
--distributed-backend nccl
--sequence-parallel
#--tp-comm-overlap # requires an MPI environment
)
LOGGING_ARGS=(
--log-interval 1
--tensorboard-dir ${TENSORBOARD_PATH}
--log-timers-to-tensorboard
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT}
--wandb-exp-name ${WANDB_NAME}
)
fi
PYTHONPATH=$MEGATRON_PATH:$AIAK_TRAINING_PATH:$PYTHONPATH \
torchrun ${DISTRIBUTED_ARGS[@]} \
$AIAK_TRAINING_PATH/aiak_training_llm/train.py \
${MODEL_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]}
Accelerating Long-Text SFT Training with a Hybrid Sequence-Parallel Strategy
Parameter configuration: the sequence-parallel arguments are defined in the aiak_training_llm/train/arguments.py file. Among them:
- --context-parallel-size: sets the sequence-parallel degree; this total degree covers both hybrid sequence-parallel strategies, Ulysses and ring attention.
- --context-parallel-ulysses-degree: on top of context-parallel-size, sets the Ulysses parallel degree. The default is 1, i.e., only Megatron's native context-parallel algorithm is used. If the two values are equal, only the Ulysses strategy is used.
- --sft-sort-batch: sorts the entire dataset by sample length in ascending order, to mitigate the long-tail problem caused by uneven sample lengths across data-parallel ranks. Note: changing the data order may affect model quality, so enable it only after careful analysis.
  - When --packing-sft-data is enabled, the whole dataset is no longer sorted; instead, the packed sequences are sorted (by the compute cost of the samples contained in each pack).
- --packing-batch-size: default 10000; used with --packing-sft-data to decide how many candidate samples take part in a single packing pass (a fragment combining these flags is sketched below).
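A minimal sketch of an SFT_ARGS fragment combining the flags above (illustrative values; the full Qwen2.5 example below keeps some of them commented out):
SFT_ARGS=(
    --chat-template qwen
    --packing-sft-data           # pack multiple short samples into one training sequence
    --packing-batch-size 10000   # candidate samples per packing pass (default)
    #--sft-sort-batch            # with packing enabled, sorts the packs rather than the raw dataset
)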
Qwen2.5 72B + 128K long-sequence SFT training example:
#! /bin/bash
# The parallel settings below (TP=4 x PP=8 x CP=8) require the world size to be a multiple of 256 GPUs, i.e. at least 32 nodes with 8 GPUs each.
MEGATRON_PATH=${MEGATRON_PATH:-"/workspace/AIAK-Megatron"}
AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH:-"/workspace/AIAK-Training-LLM"}
DATA_PATH=${DATA_PATH:-"/mnt/cluster/aiak-training-llm/dataset/sft_aplaca_zh_data.json"}
#DATA_PATH=${DATA_PATH:-"/mnt/cluster/aiak-training-llm/qwen2.5/sft_aplaca_zh_tokenized"}
DATA_CACHE_PATH=${DATA_CACHE_PATH:-"/mnt/cluster/aiak-training-llm/qwen2.5/sft_aplaca_zh_data_cache"}
DATASET_CONFIG_PATH=${DATASET_CONFIG_PATH:-"/workspace/AIAK-Training-LLM/configs/sft_dataset_config.json"}
TOKENIZER_PATH=${TOKENIZER_PATH:-"/mnt/cluster/huggingface.co/Qwen/Qwen2.5-72B"}
CHECKPOINT_PATH=${CHECKPOINT_PATH:-"/mnt/cluster/aiak-training-llm/qwen2.5/Qwen2.5_72B_mcore_tp4pp8"}
TENSORBOARD_PATH=${TENSORBOARD_PATH:-"/mnt/cluster/aiak-training-llm/tensorboard-log/qwen2.5-72b-sft"}
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=${MASTER_ADDR:-"localhost"}
MASTER_PORT=${MASTER_PORT:-"6000"}
NNODES=${WORLD_SIZE:-"1"}
NODE_RANK=${RANK:-"0"}
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
MODEL_ARGS=(
--model-name qwen2.5-72b # qwen2.5 options: 0.5b, 1.5b, 3b, 7b, 14b, 32b, 72b
--rotary-base 1000000
--rotary-seq-len-interpolation-factor 1
)
DATA_ARGS=(
--tokenizer-type HFTokenizer
--hf-tokenizer-path $TOKENIZER_PATH
--data-path $DATA_PATH
--split 100,0,0
)
SFT_ARGS=(
--chat-template qwen
--sft-num-preprocess-workers 16
--no-check-for-nan-in-loss-and-grad
#--is-tokenized-data
--packing-sft-data
#--sft-sort-batch
#--packing-batch-size 10000
#--sft-data-streaming
#--train-on-prompt
#--eod-mask-loss
#--sft-dataset-config $DATASET_CONFIG_PATH
#--sft-dataset sft_aplaca_zh_data # defined in --sft-dataset-config, default: default
#--data-cache-path $DATA_CACHE_PATH
)
TRAINING_ARGS=(
--training-phase sft # options: pretrain, sft
--seq-length 131072
--max-position-embeddings 131072
--init-method-std 0.006
--micro-batch-size 1
--global-batch-size 128
--lr 1.0e-5
--min-lr 1.0e-6
--clip-grad 1.0
--weight-decay 0.1
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-08
--norm-epsilon 1e-6
--train-iters 5000
--lr-decay-iters 5000
--lr-decay-style cosine
--lr-warmup-fraction 0.002
--initial-loss-scale 65536
--bf16
--load $CHECKPOINT_PATH
--save $CHECKPOINT_PATH
--save-interval 500
--eval-interval 100
--eval-iters 10
#--ckpt-step 0
#--no-load-optim
#--no-load-rng
#--num-workers 8
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
--use-distributed-optimizer
--overlap-grad-reduce
--overlap-param-gather
--distributed-backend nccl
--sequence-parallel
--tp-comm-overlap
--context-parallel-size 8
--context-parallel-ulysses-degree 8
--recompute-granularity full
--recompute-method block
--recompute-num-layers 13
--offload-optimizer manual
--offload-optimizer-percent 1.0
)
LOGGING_ARGS=(
--log-interval 1
--tensorboard-dir ${TENSORBOARD_PATH}
--log-timers-to-tensorboard
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT}
--wandb-exp-name ${WANDB_NAME}
)
fi
PYTHONPATH=$MEGATRON_PATH:$AIAK_TRAINING_PATH:$PYTHONPATH \
torchrun ${DISTRIBUTED_ARGS[@]} \
$AIAK_TRAINING_PATH/aiak_training_llm/train.py \
${MODEL_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${SFT_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]}
Unbalanced Pipeline-Parallel Layer Splitting
- --custom-pipeline-layers: specifies the number of layers assigned to each pipeline-parallel (PP) stage; if not set, the layers are split evenly by default.
  - Usage: with PP=4, setting --custom-pipeline-layers=19,20,20,21 assigns 19 layers to the first stage, 21 layers to the last stage, and 20 layers to each of the remaining stages.
- --custom-pipeline-recompute-layers: used when both pipeline parallelism and recomputation are enabled; specifies how many layers each PP stage recomputes, to balance memory across stages (mainly for long sequences and other scenarios that need fine-grained memory tuning).
  - Usage:
    - PP stages usually have unbalanced memory usage: the first stage generally consumes the most memory and needs the most recomputed layers, while the last stage consumes the least and can recompute the fewest layers or none at all.
    - With PP=4, setting --custom-pipeline-recompute-layers=20,15,15,0 recomputes 20 layers in the first stage, 0 in the last stage, and 15 in each of the remaining stages (a fragment using both flags is sketched below).
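A minimal sketch of a MODEL_PARALLEL_ARGS fragment using both flags, assuming an 80-layer model (e.g., LLaMA2 70B) with PP=4; the per-stage layer counts must sum to the model's total layer count:
MODEL_PARALLEL_ARGS=(
    --pipeline-model-parallel-size 4
    --custom-pipeline-layers 19,20,20,21           # 19+20+20+21 = 80 layers in total
    --recompute-granularity full
    --recompute-method block
    --custom-pipeline-recompute-layers 20,15,15,0  # most recompute on the first stage, none on the last
)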
qwen-72b long-sequence pretraining example:
#! /bin/bash
# The script needs to be run on at least 16 nodes.
MEGATRON_PATH=${MEGATRON_PATH:-"/workspace/AIAK-Megatron"}
AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH:-"/workspace/AIAK-Training-LLM"}
DATA_PATH=/mnt/cluster/qwen/qwen-data/qwen_00_text_document_content_sentence
TOKENIZER_PATH=/mnt/cluster/huggingface.co/Qwen/Qwen-72B/
CHECKPOINT_PATH=/mnt/cluster/qwen/megatron_qwen_72b_checkpoint/
TENSORBOARD_PATH=/mnt/cluster/qwen/tensorboard-log/qwen-72b
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=${MASTER_ADDR:-"localhost"}
MASTER_PORT=${MASTER_PORT:-"6000"}
NNODES=${WORLD_SIZE:-"1"}
NODE_RANK=${RANK:-"0"}
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
MODEL_ARGS=(
--model-name qwen-72b
--rotary-base 1000000
--rotary-seq-len-interpolation-factor 1
)
DATA_ARGS=(
--tokenizer-type HFTokenizer
--hf-tokenizer-path $TOKENIZER_PATH
--data-path $DATA_PATH
--split 100,0,0
)
TRAINING_ARGS=(
--training-phase pretrain # options: pretrain, sft
--seq-length 32768
--max-position-embeddings 32768
--init-method-std 0.006
--micro-batch-size 1
--global-batch-size 128
--lr 1.0e-5
--min-lr 1.0e-6
--clip-grad 1.0
--weight-decay 0.1
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-08
--norm-epsilon 1e-6
--train-iters 5000
--lr-decay-iters 5000
--lr-decay-style cosine
--lr-warmup-fraction 0.002
--initial-loss-scale 65536
--bf16
--load $CHECKPOINT_PATH
--save $CHECKPOINT_PATH
--save-interval 500
--eval-interval 100
--eval-iters 10
#--ckpt-step 0
#--no-load-optim
#--no-load-rng
#--num-workers 8
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 4
--use-distributed-optimizer
--overlap-grad-reduce
--overlap-param-gather
--distributed-backend nccl
--sequence-parallel
--tp-comm-overlap
--context-parallel-size 2
--context-parallel-ulysses-degree 1
--recompute-granularity full
--recompute-method block
--recompute-num-layers 13
--custom-pipeline-recompute-layers 11,7,3,0
--custom-pipeline-layers 18,19,21,22
--offload-optimizer manual
--offload-optimizer-percent 1.0
)
LOGGING_ARGS=(
--log-interval 1
--tensorboard-dir ${TENSORBOARD_PATH}
--log-timers-to-tensorboard
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT}
--wandb-exp-name ${WANDB_NAME}
)
fi
PYTHONPATH=$MEGATRON_PATH:$AIAK_TRAINING_PATH:$PYTHONPATH \
torchrun ${DISTRIBUTED_ARGS[@]} \
$AIAK_TRAINING_PATH/aiak_training_llm/train.py \
${MODEL_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]}
CogVLM2 Model Training Guide
Dataset Format and Processing
The current version builds its training pipeline mainly on the open-source dataset examples provided by CogVLM. You can use the open-source CogVLM-SFT-311K dataset, or construct your own dataset in the same format, for training and fine-tuning.
Data format notes:
- Currently only Caption and VQA data formats are supported: Caption data is used in the Pretrain phase and VQA data in the SFT phase.
- Both dataset types must be organized in the following layout:
.llava_details-minigpt4_3500_formate
├── images
│ └── 00000001.jpg
└── labels
└── 00000001.json
- The images folder holds the image files and the labels folder holds the corresponding label files; image and label files are matched one-to-one by name. Images are in jpg format and labels in json format (a quick consistency check is sketched after the examples below).
- The label file of a Pretrain dataset contains a piece of caption text, given by the content field:
{
"captions": [
{
"role": "caption",
"content": "这张图片是一个计算机生成的场景,画面中一名女网球运动员手里拿着网球拍。网球分布在整个网球场周围,有些在她上面,有些在下面,还有一些从左右两侧向她飞来。"
}
]
}
- The label file of an SFT dataset contains a conversation between the user and assistant roles; each turn consists of a role field and a content field, as shown below:
{
"conversations": [
{
"role": "user",
"content": "图片中可以看到多少人玩滑板?"
},
{
"role": "assistant",
"content": "图片中可以看到两个人拿着滑板。"
}
...
]
}
Note: for multi-turn conversations, training randomly selects one or two turns to participate in training, consistent for now with the CogVLM2 open-source logic (https://github.com/THUDM/CogVLM2/blob/cf9cb3c60a871e0c8e5bde7feaf642e3021153e6/finetune_demo/peft_lora.py#L79).
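A minimal sketch of a dataset consistency check, assuming the layout above (the dataset root below is only an illustrative path; adjust it to your own directory):
DATASET_ROOT=/mnt/pfs/aiak-training-llm/cogvlm2/CogVLM-SFT-311K/llava_details-minigpt4_3500_formate
for img in "$DATASET_ROOT"/images/*.jpg; do
    name=$(basename "$img" .jpg)
    # every image must have a label file with the same basename
    if [ ! -f "$DATASET_ROOT/labels/$name.json" ]; then
        echo "missing label for $name"
    fi
done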
Checkpoint Conversion
You can refer to the example scripts under the /workspace/AIAK-Training-LLM/examples/cogvlm2/checkpoint_convert directory. Implementation notes:
- Compared with a standard LLM, CogVLM adds several vision-specific structures. To simplify the conversion, the weights are converted module by module and then merged (in the future this also makes it easier to swap in existing LLaMA or Vision Encoder weights).
Convert Hugging Face weights to MCore format:
#! /bin/bash
AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH:-"/workspace/AIAK-Training-LLM"}
AIAK_MAGATRON_PATH=${AIAK_MAGATRON_PATH:-"/workspace/AIAK-Megatron"}
CONVERT_CHECKPOINT_PATH="$AIAK_TRAINING_PATH/tools/convert_checkpoint"
LOAD=/mnt/pfs/huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B/
SAVE=/mnt/pfs/aiak-training-llm/cogvlm2/converted_cogvlm2-llama3-chinese-chat-19B_tp4_pp1
SAVE_LANGUAGE_EXPERT=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/language-expert-mcore
SAVE_VISION_EXPERT=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/vision-expert-mcore
SAVE_VISION_MODEL=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/vision-model-mcore
SAVE_ADAPTER=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/adapter-mcore
SAVE_PATCH=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/patch-mcore
TP=4
# Step 1: convert the language expert weights in llama3
python $CONVERT_CHECKPOINT_PATH/model.py \
--load_platform=huggingface \
--save_platform=mcore \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/language-expert.json \
--tensor_model_parallel_size=$TP \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_LANGUAGE_EXPERT \
--safetensors \
--no-te \
--no_save_optim \
--no_load_optim
# Step 2: convert the vision expert weights in llama3
python $CONVERT_CHECKPOINT_PATH/model.py \
--load_platform=huggingface \
--save_platform=mcore \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/vision-expert.json \
--tensor_model_parallel_size=$TP \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_VISION_EXPERT \
--safetensors \
--no-te \
--no_save_optim \
--no_load_optim
# Step 3: convert the Transformer layers of the ViT vision model
python $CONVERT_CHECKPOINT_PATH/model.py \
--load_platform=huggingface \
--save_platform=mcore \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/vision-model.json \
--tensor_model_parallel_size=$TP \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_VISION_MODEL \
--safetensors \
--no-te \
--no_save_optim \
--no_load_optim
# Step 4: convert the adapter between the vision model and the language model
python $CONVERT_CHECKPOINT_PATH/custom/cogvlm/adapter.py \
--load_platform=huggingface \
--save_platform=mcore \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/adapter.json \
--tensor_model_parallel_size=$TP \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_ADAPTER
# Step 5: convert the patch embedding of the ViT vision model
python $CONVERT_CHECKPOINT_PATH/custom/cogvlm/vision_patch.py \
--load_platform=huggingface \
--save_platform=mcore \
--tensor_model_parallel_size=$TP \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/vision-patch.json \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_PATCH
# Step 6: merge the outputs of the previous 5 steps
python $CONVERT_CHECKPOINT_PATH/custom/cogvlm/merge_megatron.py \
--megatron_path $AIAK_MAGATRON_PATH \
--language_expert_path $SAVE_LANGUAGE_EXPERT/release \
--vision_expert_path $SAVE_VISION_EXPERT/release \
--vision_model_path $SAVE_VISION_MODEL/release \
--vision_patch $SAVE_PATCH/release \
--adapter_path $SAVE_ADAPTER/release \
--save_ckpt_path $SAVE/release
echo release > $SAVE/latest_checkpointed_iteration.txt
# Remove the temporary outputs of the first 5 steps
rm -rf $SAVE_LANGUAGE_EXPERT
rm -rf $SAVE_VISION_EXPERT
rm -rf $SAVE_VISION_MODEL
rm -rf $SAVE_ADAPTER
rm -rf $SAVE_PATCH
Convert MCore weights to Hugging Face format:
#! /bin/bash
AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH:-"/workspace/AIAK-Training-LLM"}
AIAK_MAGATRON_PATH=${AIAK_MAGATRON_PATH:-"/workspace/AIAK-Megatron"}
CONVERT_CHECKPOINT_PATH="$AIAK_TRAINING_PATH/tools/convert_checkpoint"
SAVE=/mnt/pfs/aiak-training-llm/cogvlm2/converted_converted_cogvlm2
LOAD=/mnt/pfs/aiak-training-llm/cogvlm2/mcore_cogvlm2_llama3_chinese_chat_19B_tp4_pp1/release/
SAVE_LANGUAGE_EXPERT=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/language-expert-hf
SAVE_VISION_EXPERT=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/vision-expert-hf
SAVE_VISION_MODEL=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/vision-model-hf
SAVE_ADAPTER=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/adapter-hf
SAVE_PATCH=/mnt/pfs/aiak-training-llm/cogvlm2/tmp/patch-hf
TP=4
# Step 1: convert the language expert weights in llama3
python $CONVERT_CHECKPOINT_PATH/model.py \
--load_platform=mcore \
--megatron_path $AIAK_MAGATRON_PATH \
--save_platform=huggingface \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/language-expert.json \
--tensor_model_parallel_size=$TP \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_LANGUAGE_EXPERT \
--safetensors \
--no-te \
--no_save_optim \
--no_load_optim
# Step 2: convert the vision expert weights in llama3
python $CONVERT_CHECKPOINT_PATH/model.py \
--load_platform=mcore \
--save_platform=huggingface \
--megatron_path $AIAK_MAGATRON_PATH \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/vision-expert.json \
--tensor_model_parallel_size=$TP \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_VISION_EXPERT \
--safetensors \
--no-te \
--no_save_optim \
--no_load_optim
# Step 3: convert the Transformer layers of the ViT vision model
python $CONVERT_CHECKPOINT_PATH/model.py \
--load_platform=mcore \
--save_platform=huggingface \
--megatron_path $AIAK_MAGATRON_PATH \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/vision-model.json \
--tensor_model_parallel_size=$TP \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_VISION_MODEL \
--safetensors \
--no-te \
--no_save_optim \
--no_load_optim
# Step 4: convert the adapter between the vision model and the language model
python $CONVERT_CHECKPOINT_PATH/custom/cogvlm/adapter.py \
--load_platform=mcore \
--save_platform=huggingface \
--megatron_path $AIAK_MAGATRON_PATH \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/adapter.json \
--tensor_model_parallel_size=$TP \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_ADAPTER
# Step 5: convert the patch embedding of the ViT vision model
python $CONVERT_CHECKPOINT_PATH/custom/cogvlm/vision_patch.py \
--load_platform=mcore \
--save_platform=huggingface \
--megatron_path $AIAK_MAGATRON_PATH \
--tensor_model_parallel_size=$TP \
--common_config_path=$CONVERT_CHECKPOINT_PATH/config/cogvlm2-19b/vision-patch.json \
--load_ckpt_path=$LOAD \
--save_ckpt_path=$SAVE_PATCH
# Step 6: merge the outputs of the previous 5 steps
python $CONVERT_CHECKPOINT_PATH/custom/cogvlm/merge_huggingface.py \
--megatron_path $AIAK_MAGATRON_PATH \
--language_expert_path $SAVE_LANGUAGE_EXPERT \
--vision_expert_path $SAVE_VISION_EXPERT \
--vision_model_path $SAVE_VISION_MODEL \
--vision_patch $SAVE_PATCH \
--adapter_path $SAVE_ADAPTER \
--save_ckpt_path $SAVE
# Remove the temporary outputs of the first 5 steps
rm -rf $SAVE_LANGUAGE_EXPERT
rm -rf $SAVE_VISION_EXPERT
rm -rf $SAVE_VISION_MODEL
rm -rf $SAVE_ADAPTER
rm -rf $SAVE_PATCH
Runtime Parameter Notes
The newly added parameters are defined in the aiak_training_llm/train/arguments.py file. Key parameters:
- --trainable-modules: specifies which modules are trainable; multiple values can be given:
  - all: train all parameters (if only part of the model should be trained, use the options below);
  - vision_model: train the vision encoder parameters, e.g. the ViT encoder;
  - adapter: train the adapter between the vision model and the language model, e.g. the MLP adapter;
  - language_model: train the language decoder parameters (including the visual expert inside the language model); for finer control over per-modality parameters, use the options below;
  - vision_expert_linear: train the vision-expert linear layers, i.e. linear_qkv and linear_proj in Attention and linear_fc1 and linear_fc2 in the MLP;
  - language_expert_linear: train the language-expert linear layers, i.e. linear_qkv and linear_proj in Attention and linear_fc1 and linear_fc2 in the MLP.
  Note: in the original CogVLM1 paper, only the MLP adapter and the vision expert are trained, so --trainable-modules can be set to adapter vision_expert_linear (as in the example scripts below); all other weights are then frozen and excluded from parameter updates. A fragment illustrating this is sketched right after this list.
- --no-rope-in-fp32: optional; keeps the RoPE parameters of the language model in the same precision as the model instead of the default float32. See https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B/blob/f592f291cf528389b2e4776b1e84ecdf6d71fbe3/util.py#L376
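A minimal sketch of the two common --trainable-modules settings (illustrative; the partial setting matches the TRAINING_ARGS blocks in the launch scripts below):
# CogVLM1-style partial fine-tuning: only the MLP adapter and the vision-expert linear layers are updated.
TRAINING_ARGS+=(
    --trainable-modules vision_expert_linear adapter
)
# Alternatively, full fine-tuning of every module:
#TRAINING_ARGS+=(
#    --trainable-modules all
#)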
Launch Examples
Pretrain
#! /bin/bash
MEGATRON_PATH=${MEGATRON_PATH:-"/workspace/AIAK-Megatron"}
AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH:-"/workspace/AIAK-Training-LLM"}
DATA_PATH=${DATA_PATH:-"/mnt/pfs/aiak-training-llm/cogvlm2/CogVLM-SFT-311K/llava_details-minigpt4_3500_formate/"}
TOKENIZER_PATH=${TOKENIZER_PATH:-"//mnt/pfs/huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B/"}
CHECKPOINT_PATH=${CHECKPOINT_PATH:-"/mnt/pfs/aiak-training-llm/cogvlm2/mcore_cogvlm2_llama3_chinese_chat_19B_tp4_pp1"}
TENSORBOARD_PATH=${TENSORBOARD_PATH:-"/mnt/pfs/aiak-training-llm/tensorboard-log/cogvlm2-llama3-chinese-chat-19B-pretrain"}
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=${MASTER_ADDR:-"localhost"}
MASTER_PORT=${MASTER_PORT:-"6000"}
NNODES=${WORLD_SIZE:-"1"}
NODE_RANK=${RANK:-"0"}
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
# set up the cogvlm2 model using the predefined model name
MODEL_ARGS=(
--model-name cogvlm2-llama3-chinese-chat-19b
--rotary-base 500000
)
IMG_ARGS=(
--img-h 1344
--img-w 1344
--patch-dim 14
)
DATA_ARGS=(
--tokenizer-type HFTokenizer
--hf-tokenizer-path $TOKENIZER_PATH
--data-path $DATA_PATH
--split 949,50,1
)
TRAINING_ARGS=(
--training-phase pretrain
--trainable-modules vision_expert_linear adapter
--seq-length 4096
--max-position-embeddings 4096
--init-method-std 0.02
--micro-batch-size 1
--global-batch-size 2
--lr 0.0002
--min-lr 1.0e-5
--clip-grad 1.0
--weight-decay 0.01
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-05
--norm-epsilon 1e-05
--train-iters 50000
--lr-decay-iters 50000
--lr-decay-style cosine
--lr-warmup-fraction 0.002
--initial-loss-scale 65536
--bf16
--no-rope-in-fp32
--load $CHECKPOINT_PATH
--save $CHECKPOINT_PATH
--save-interval 5000
--eval-interval 1000
--eval-iters 10
--no-load-optim
--no-load-rng
#--ckpt-step 0
)
MODEL_PARALLEL_ARGS=(
--pipeline-model-parallel-size 1
--tensor-model-parallel-size 4
--use-distributed-optimizer
--overlap-grad-reduce
--overlap-param-gather
--distributed-backend nccl
#--sequence-parallel
)
LOGGING_ARGS=(
--log-interval 1
--tensorboard-dir ${TENSORBOARD_PATH}
--log-timers-to-tensorboard
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT}
--wandb-exp-name ${WANDB_NAME}
)
fi
PYTHONPATH=$MEGATRON_PATH:$AIAK_TRAINING_PATH:$PYTHONPATH \
torchrun ${DISTRIBUTED_ARGS[@]} \
$AIAK_TRAINING_PATH/aiak_training_llm/train.py \
${MODEL_ARGS[@]} \
${DATA_ARGS[@]} \
${IMG_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]}
SFT
#! /bin/bash
MEGATRON_PATH=${MEGATRON_PATH:-"/workspace/AIAK-Megatron"}
AIAK_TRAINING_PATH=${AIAK_TRAINING_PATH:-"/workspace/AIAK-Training-LLM"}
DATA_PATH=${DATA_PATH:-"/mnt/pfs/aiak-training-llm/cogvlm2/CogVLM-SFT-311K/llava_instruction_multi_conversations_formate/"}
TOKENIZER_PATH=${TOKENIZER_PATH:-"//mnt/pfs/huggingface.co/THUDM/cogvlm2-llama3-chinese-chat-19B/"}
CHECKPOINT_PATH=${CHECKPOINT_PATH:-"/mnt/pfs/aiak-training-llm/cogvlm2/mcore_cogvlm2_llama3_chinese_chat_19B_tp4_pp1"}
TENSORBOARD_PATH=${TENSORBOARD_PATH:-"/mnt/pfs/aiak-training-llm/tensorboard-log/cogvlm2-llama3-chinese-chat-19B-pretrain"}
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=${MASTER_ADDR:-"localhost"}
MASTER_PORT=${MASTER_PORT:-"6000"}
NNODES=${WORLD_SIZE:-"1"}
NODE_RANK=${RANK:-"0"}
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
# set up the cogvlm2 model using the predefined model name
MODEL_ARGS=(
--model-name cogvlm2-llama3-chinese-chat-19b
--rotary-base 500000
)
IMG_ARGS=(
--img-h 1344
--img-w 1344
--patch-dim 14
)
DATA_ARGS=(
--tokenizer-type HFTokenizer
--hf-tokenizer-path $TOKENIZER_PATH
--data-path $DATA_PATH
--split 949,50,1
)
TRAINING_ARGS=(
--training-phase sft
--trainable-modules vision_expert_linear adapter
--chat-template empty
--seq-length 4096
--max-position-embeddings 4096
--init-method-std 0.02
--micro-batch-size 1
--global-batch-size 2
--lr 0.0002
--min-lr 1.0e-5
--clip-grad 1.0
--weight-decay 0.01
--optimizer adam
--adam-beta1 0.9
--adam-beta2 0.95
--adam-eps 1e-05
--norm-epsilon 1e-05
--train-iters 50000
--lr-decay-iters 50000
--lr-decay-style cosine
--lr-warmup-fraction 0.002
--initial-loss-scale 65536
--bf16
--no-rope-in-fp32
--load $CHECKPOINT_PATH
--save $CHECKPOINT_PATH
--save-interval 5000
--eval-interval 1000
--eval-iters 10
--no-load-optim
--no-load-rng
#--ckpt-step 0
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--use-distributed-optimizer
--overlap-grad-reduce
--overlap-param-gather
--distributed-backend nccl
)
LOGGING_ARGS=(
--log-interval 1
--tensorboard-dir ${TENSORBOARD_PATH}
--log-timers-to-tensorboard
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT}
--wandb-exp-name ${WANDB_NAME}
)
fi
PYTHONPATH=$MEGATRON_PATH:$AIAK_TRAINING_PATH:$PYTHONPATH \
torchrun ${DISTRIBUTED_ARGS[@]} \
$AIAK_TRAINING_PATH/aiak_training_llm/train.py \
${MODEL_ARGS[@]} \
${DATA_ARGS[@]} \
${IMG_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]}
Enabling the BCCL Communication Library
The AIAK image ships with the BCCL communication library; NCCL is used by default, and BCCL can be enabled through an environment variable:
export LD_LIBRARY_PATH=$BCCL_PATH:$LD_LIBRARY_PATH
The current version supports BCCL only on A800 GPUs; H800 and other GPU models are not yet supported. A launch sketch follows below.
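A minimal sketch of enabling BCCL before launching training, assuming $BCCL_PATH is set by the AIAK image to the directory containing the BCCL libraries, and reusing the argument arrays from the earlier example scripts:
# Prepend the BCCL library directory so it is loaded instead of the default NCCL.
export LD_LIBRARY_PATH=$BCCL_PATH:$LD_LIBRARY_PATH
# Then launch training as usual, for example:
torchrun ${DISTRIBUTED_ARGS[@]} \
    $AIAK_TRAINING_PATH/aiak_training_llm/train.py \
    ${MODEL_ARGS[@]} ${DATA_ARGS[@]} ${TRAINING_ARGS[@]} ${MODEL_PARALLEL_ARGS[@]} ${LOGGING_ARGS[@]}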