# maziyarpanahi/dolphin-2.9.3-mistral-nemo.yml

## dolphin-2.9.3-mistral-nemo.yml
# Base checkpoint (a local path) and the model/tokenizer classes used to load it.
base_model: /workspace/models/Mistral-Nemo-Base-2407
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Quantized loading is off: 8-bit is explicitly false and the 4-bit option is commented out.
load_in_8bit: false
# load_in_4bit: true
strict: false

# Training data: fifteen local JSONL files under /workspace/datasets/dolphin-2.9.3,
# every one declared as `type: sharegpt` with `conversation: chatml`.
datasets:
  - path: /workspace/datasets/dolphin-2.9.3/dolphin201-sharegpt2.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/SystemChat_filtered_sharegpt.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/SystemChat_multilingual_sharegpt.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/dolphin-coder-translate-sharegpt2.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/dolphin-coder-codegen-sharegpt2.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/m-a-p_Code-Feedback-sharegpt-unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/m-a-p_CodeFeedback-Filtered-Instruction-sharegpt-unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/not_samantha_norefusals.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/Orca-Math-resort-unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/agent_instruct_react_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/toolbench_instruct_j1s1_3k_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/toolbench_negative_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/toolbench_react_10p_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/toolbench_tflan_cot_30p_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/dolphin-2.9.3/openhermes200k_unfiltered.jsonl
    type: sharegpt
    conversation: chatml

chat_template: chatml
# LoRA/QLoRA adapter settings, kept for reference but inactive: every line below is commented out.
# adapter: qlora
# lora_r: 128
# lora_alpha: 16
# lora_modules_to_save: [embed_tokens, lm_head]
# lora_dropout: 0.05
# lora_target_linear: true


# Parameter-name patterns selected for training. The first two entries carry
# ^...$ anchors, so the entries are presumably interpreted as regular
# expressions by the trainer, with everything unlisted staying frozen — confirm.
unfrozen_parameters:
  # Output head, input embeddings, norms and rotary embedding
  - ^lm_head.weight$
  - ^model.embed_tokens.weight$
  - input_layernorm
  - model.norm
  - post_attention_layernorm
  - self_attn.rotary_emb
  # MLP down projection (20 layers)
  - model.layers.0.mlp.down_proj
  - model.layers.1.mlp.down_proj
  - model.layers.4.mlp.down_proj
  - model.layers.37.mlp.down_proj
  - model.layers.24.mlp.down_proj
  - model.layers.2.mlp.down_proj
  - model.layers.38.mlp.down_proj
  - model.layers.35.mlp.down_proj
  - model.layers.25.mlp.down_proj
  - model.layers.6.mlp.down_proj
  - model.layers.22.mlp.down_proj
  - model.layers.23.mlp.down_proj
  - model.layers.3.mlp.down_proj
  - model.layers.21.mlp.down_proj
  - model.layers.5.mlp.down_proj
  - model.layers.28.mlp.down_proj
  - model.layers.20.mlp.down_proj
  - model.layers.26.mlp.down_proj
  - model.layers.19.mlp.down_proj
  - model.layers.34.mlp.down_proj
  # MLP gate projection (20 layers)
  - model.layers.2.mlp.gate_proj
  - model.layers.1.mlp.gate_proj
  - model.layers.3.mlp.gate_proj
  - model.layers.5.mlp.gate_proj
  - model.layers.4.mlp.gate_proj
  - model.layers.35.mlp.gate_proj
  - model.layers.36.mlp.gate_proj
  - model.layers.37.mlp.gate_proj
  - model.layers.38.mlp.gate_proj
  - model.layers.34.mlp.gate_proj
  - model.layers.33.mlp.gate_proj
  - model.layers.8.mlp.gate_proj
  - model.layers.32.mlp.gate_proj
  - model.layers.6.mlp.gate_proj
  - model.layers.28.mlp.gate_proj
  - model.layers.26.mlp.gate_proj
  - model.layers.30.mlp.gate_proj
  - model.layers.23.mlp.gate_proj
  - model.layers.29.mlp.gate_proj
  - model.layers.27.mlp.gate_proj
  # MLP up projection (20 layers)
  - model.layers.3.mlp.up_proj
  - model.layers.4.mlp.up_proj
  - model.layers.6.mlp.up_proj
  - model.layers.2.mlp.up_proj
  - model.layers.5.mlp.up_proj
  - model.layers.8.mlp.up_proj
  - model.layers.10.mlp.up_proj
  - model.layers.9.mlp.up_proj
  - model.layers.7.mlp.up_proj
  - model.layers.0.mlp.up_proj
  - model.layers.17.mlp.up_proj
  - model.layers.15.mlp.up_proj
  - model.layers.22.mlp.up_proj
  - model.layers.18.mlp.up_proj
  - model.layers.16.mlp.up_proj
  - model.layers.11.mlp.up_proj
  - model.layers.21.mlp.up_proj
  - model.layers.23.mlp.up_proj
  - model.layers.20.mlp.up_proj
  - model.layers.27.mlp.up_proj
  # Attention key projection (20 layers)
  - model.layers.30.self_attn.k_proj
  - model.layers.27.self_attn.k_proj
  - model.layers.25.self_attn.k_proj
  - model.layers.33.self_attn.k_proj
  - model.layers.26.self_attn.k_proj
  - model.layers.31.self_attn.k_proj
  - model.layers.35.self_attn.k_proj
  - model.layers.39.self_attn.k_proj
  - model.layers.22.self_attn.k_proj
  - model.layers.24.self_attn.k_proj
  - model.layers.21.self_attn.k_proj
  - model.layers.28.self_attn.k_proj
  - model.layers.23.self_attn.k_proj
  - model.layers.36.self_attn.k_proj
  - model.layers.20.self_attn.k_proj
  - model.layers.37.self_attn.k_proj
  - model.layers.29.self_attn.k_proj
  - model.layers.32.self_attn.k_proj
  - model.layers.16.self_attn.k_proj
  - model.layers.18.self_attn.k_proj
  # Attention output projection (20 layers)
  - model.layers.7.self_attn.o_proj
  - model.layers.6.self_attn.o_proj
  - model.layers.9.self_attn.o_proj
  - model.layers.5.self_attn.o_proj
  - model.layers.27.self_attn.o_proj
  - model.layers.26.self_attn.o_proj
  - model.layers.4.self_attn.o_proj
  - model.layers.31.self_attn.o_proj
  - model.layers.8.self_attn.o_proj
  - model.layers.16.self_attn.o_proj
  - model.layers.3.self_attn.o_proj
  - model.layers.10.self_attn.o_proj
  - model.layers.18.self_attn.o_proj
  - model.layers.33.self_attn.o_proj
  - model.layers.17.self_attn.o_proj
  - model.layers.32.self_attn.o_proj
  - model.layers.30.self_attn.o_proj
  - model.layers.2.self_attn.o_proj
  - model.layers.15.self_attn.o_proj
  - model.layers.11.self_attn.o_proj
  # Attention query projection (20 layers)
  - model.layers.14.self_attn.q_proj
  - model.layers.11.self_attn.q_proj
  - model.layers.15.self_attn.q_proj
  - model.layers.9.self_attn.q_proj
  - model.layers.8.self_attn.q_proj
  - model.layers.18.self_attn.q_proj
  - model.layers.12.self_attn.q_proj
  - model.layers.13.self_attn.q_proj
  - model.layers.19.self_attn.q_proj
  - model.layers.16.self_attn.q_proj
  - model.layers.10.self_attn.q_proj
  - model.layers.17.self_attn.q_proj
  - model.layers.7.self_attn.q_proj
  - model.layers.5.self_attn.q_proj
  - model.layers.20.self_attn.q_proj
  - model.layers.3.self_attn.q_proj
  - model.layers.26.self_attn.q_proj
  - model.layers.27.self_attn.q_proj
  - model.layers.28.self_attn.q_proj
  - model.layers.33.self_attn.q_proj
  # Attention value projection (20 layers)
  - model.layers.27.self_attn.v_proj
  - model.layers.20.self_attn.v_proj
  - model.layers.24.self_attn.v_proj
  - model.layers.25.self_attn.v_proj
  - model.layers.30.self_attn.v_proj
  - model.layers.2.self_attn.v_proj
  - model.layers.23.self_attn.v_proj
  - model.layers.22.self_attn.v_proj
  - model.layers.26.self_attn.v_proj
  - model.layers.33.self_attn.v_proj
  - model.layers.37.self_attn.v_proj
  - model.layers.7.self_attn.v_proj
  - model.layers.4.self_attn.v_proj
  - model.layers.18.self_attn.v_proj
  - model.layers.31.self_attn.v_proj
  - model.layers.17.self_attn.v_proj
  - model.layers.35.self_attn.v_proj
  - model.layers.32.self_attn.v_proj
  - model.layers.21.self_attn.v_proj
  - model.layers.3.self_attn.v_proj


# Prepared-dataset directory, validation split size, and checkpoint output directory.
# val_set_size of 0.01 is presumably a 1% hold-out fraction — confirm against the trainer docs.
dataset_prepared_path: /workspace/axolotl/dolph-2.9.3-nemo-prepared
val_set_size: 0.01
output_dir: /workspace/axolotl/dolphin-2.9.3-mistral-nemo

# Sequence length 8192, with sample packing and pad-to-length both enabled.
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

# Weights & Biases logging: only the project name is set; the remaining
# options are deliberately null.
wandb_project: dolphin-2.9.3-Mistral-nemo
wandb_watch: null
wandb_run_id: null
wandb_log_model: null

# Optimiser and schedule: 16 accumulation steps at micro-batch size 1,
# three epochs, AdamW (torch) with a cosine learning-rate schedule.
gradient_accumulation_steps: 16
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
# Written with a decimal point so YAML 1.1 loaders (e.g. PyYAML) resolve it as
# a float; a bare `5e-6` is resolved by those loaders as the string "5e-6".
learning_rate: 5.0e-6
train_on_inputs: false
group_by_length: false
# Precision: bf16 is set to `auto`; fp16 and tf32 are left null.
bf16: auto
fp16: null
tf32: null

# Gradient checkpointing is on, configured with use_reentrant set to false.
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
# No early stopping and no checkpoint to resume from.
early_stopping_patience: null
resume_from_checkpoint: null
logging_steps: 1
# Attention backend: flash attention enabled, xformers left null.
xformers_attention: null
flash_attention: true

# Warm-up, evaluation and checkpoint cadence.
warmup_steps: 100
# evals_per_epoch: 4
eval_table_size: null
# Saving is driven by saves_per_epoch; save_steps is left null.
saves_per_epoch: 1
save_total_limit: 2
save_steps: null
debug: null
# DeepSpeed config file (relative path) and weight decay.
deepspeed: deepspeed_configs/zero3_bf16.json
weight_decay: 0.1
# Special-token overrides; eos_token is set to the `<|im_end|>` marker.
special_tokens:
  eos_token: '<|im_end|>'
  pad_token: '<pad>'
  bos_token: '<s>'
  unk_token: '<unk>'
# Additional token; presumably added to the tokenizer vocabulary — confirm.
tokens:
  - '<|im_start|>'


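# Disabled FSDP alternative to the `deepspeed` setting above.
# NOTE(review): `MixtralSparseMoeBlock` below looks carried over from a Mixtral
# config; confirm the layer class to wrap before enabling this block.
# fsdp: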
#   - full_shard
#   - auto_wrap
# fsdp_config:
#   fsdp_limit_all_gathers: true
#   fsdp_sync_module_states: true
#   fsdp_offload_params: true
#   fsdp_use_orig_params: false
#   fsdp_cpu_ram_efficient_loading: true
#   fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
#   fsdp_state_dict_type: FULL_STATE_DICT
#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
#   fsdp_sharding_strategy: FULL_SHARD
#   fsdp_forward_prefetch: false
#   fsdp_backward_prefetch: BACKWARD_PRE
	base_model: /workspace/models/Mistral-Nemo-Base-2407
	model_type: AutoModelForCausalLM
	tokenizer_type: AutoTokenizer

	load_in_8bit: false
	# load_in_4bit: true
	strict: false

	datasets:
	  - path: /workspace/datasets/dolphin-2.9.3/dolphin201-sharegpt2.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/SystemChat_filtered_sharegpt.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/SystemChat_multilingual_sharegpt.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/dolphin-coder-translate-sharegpt2.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/dolphin-coder-codegen-sharegpt2.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/m-a-p_Code-Feedback-sharegpt-unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/m-a-p_CodeFeedback-Filtered-Instruction-sharegpt-unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/not_samantha_norefusals.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/Orca-Math-resort-unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/agent_instruct_react_unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/toolbench_instruct_j1s1_3k_unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/toolbench_negative_unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/toolbench_react_10p_unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/toolbench_tflan_cot_30p_unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml
	  - path: /workspace/datasets/dolphin-2.9.3/openhermes200k_unfiltered.jsonl
	    type: sharegpt
	    conversation: chatml

	chat_template: chatml
	# adapter: qlora
	# lora_r: 128
	# lora_alpha: 16
	# lora_modules_to_save: [embed_tokens, lm_head]
	# lora_dropout: 0.05
	# lora_target_linear: true


	unfrozen_parameters:
	- ^lm_head.weight$
	- ^model.embed_tokens.weight$
	- input_layernorm
	- model.norm
	- post_attention_layernorm
	- self_attn.rotary_emb
	# mlp.down_proj layers
	- model.layers.0.mlp.down_proj
	- model.layers.1.mlp.down_proj
	- model.layers.4.mlp.down_proj
	- model.layers.37.mlp.down_proj
	- model.layers.24.mlp.down_proj
	- model.layers.2.mlp.down_proj
	- model.layers.38.mlp.down_proj
	- model.layers.35.mlp.down_proj
	- model.layers.25.mlp.down_proj
	- model.layers.6.mlp.down_proj
	- model.layers.22.mlp.down_proj
	- model.layers.23.mlp.down_proj
	- model.layers.3.mlp.down_proj
	- model.layers.21.mlp.down_proj
	- model.layers.5.mlp.down_proj
	- model.layers.28.mlp.down_proj
	- model.layers.20.mlp.down_proj
	- model.layers.26.mlp.down_proj
	- model.layers.19.mlp.down_proj
	- model.layers.34.mlp.down_proj
	# mlp.gate_proj layers
	- model.layers.2.mlp.gate_proj
	- model.layers.1.mlp.gate_proj
	- model.layers.3.mlp.gate_proj
	- model.layers.5.mlp.gate_proj
	- model.layers.4.mlp.gate_proj
	- model.layers.35.mlp.gate_proj
	- model.layers.36.mlp.gate_proj
	- model.layers.37.mlp.gate_proj
	- model.layers.38.mlp.gate_proj
	- model.layers.34.mlp.gate_proj
	- model.layers.33.mlp.gate_proj
	- model.layers.8.mlp.gate_proj
	- model.layers.32.mlp.gate_proj
	- model.layers.6.mlp.gate_proj
	- model.layers.28.mlp.gate_proj
	- model.layers.26.mlp.gate_proj
	- model.layers.30.mlp.gate_proj
	- model.layers.23.mlp.gate_proj
	- model.layers.29.mlp.gate_proj
	- model.layers.27.mlp.gate_proj
	# mlp.up_proj layers
	- model.layers.3.mlp.up_proj
	- model.layers.4.mlp.up_proj
	- model.layers.6.mlp.up_proj
	- model.layers.2.mlp.up_proj
	- model.layers.5.mlp.up_proj
	- model.layers.8.mlp.up_proj
	- model.layers.10.mlp.up_proj
	- model.layers.9.mlp.up_proj
	- model.layers.7.mlp.up_proj
	- model.layers.0.mlp.up_proj
	- model.layers.17.mlp.up_proj
	- model.layers.15.mlp.up_proj
	- model.layers.22.mlp.up_proj
	- model.layers.18.mlp.up_proj
	- model.layers.16.mlp.up_proj
	- model.layers.11.mlp.up_proj
	- model.layers.21.mlp.up_proj
	- model.layers.23.mlp.up_proj
	- model.layers.20.mlp.up_proj
	- model.layers.27.mlp.up_proj
	# self_attn.k_proj layers
	- model.layers.30.self_attn.k_proj
	- model.layers.27.self_attn.k_proj
	- model.layers.25.self_attn.k_proj
	- model.layers.33.self_attn.k_proj
	- model.layers.26.self_attn.k_proj
	- model.layers.31.self_attn.k_proj
	- model.layers.35.self_attn.k_proj
	- model.layers.39.self_attn.k_proj
	- model.layers.22.self_attn.k_proj
	- model.layers.24.self_attn.k_proj
	- model.layers.21.self_attn.k_proj
	- model.layers.28.self_attn.k_proj
	- model.layers.23.self_attn.k_proj
	- model.layers.36.self_attn.k_proj
	- model.layers.20.self_attn.k_proj
	- model.layers.37.self_attn.k_proj
	- model.layers.29.self_attn.k_proj
	- model.layers.32.self_attn.k_proj
	- model.layers.16.self_attn.k_proj
	- model.layers.18.self_attn.k_proj
	# self_attn.o_proj layers
	- model.layers.7.self_attn.o_proj
	- model.layers.6.self_attn.o_proj
	- model.layers.9.self_attn.o_proj
	- model.layers.5.self_attn.o_proj
	- model.layers.27.self_attn.o_proj
	- model.layers.26.self_attn.o_proj
	- model.layers.4.self_attn.o_proj
	- model.layers.31.self_attn.o_proj
	- model.layers.8.self_attn.o_proj
	- model.layers.16.self_attn.o_proj
	- model.layers.3.self_attn.o_proj
	- model.layers.10.self_attn.o_proj
	- model.layers.18.self_attn.o_proj
	- model.layers.33.self_attn.o_proj
	- model.layers.17.self_attn.o_proj
	- model.layers.32.self_attn.o_proj
	- model.layers.30.self_attn.o_proj
	- model.layers.2.self_attn.o_proj
	- model.layers.15.self_attn.o_proj
	- model.layers.11.self_attn.o_proj
	# self_attn.q_proj layers
	- model.layers.14.self_attn.q_proj
	- model.layers.11.self_attn.q_proj
	- model.layers.15.self_attn.q_proj
	- model.layers.9.self_attn.q_proj
	- model.layers.8.self_attn.q_proj
	- model.layers.18.self_attn.q_proj
	- model.layers.12.self_attn.q_proj
	- model.layers.13.self_attn.q_proj
	- model.layers.19.self_attn.q_proj
	- model.layers.16.self_attn.q_proj
	- model.layers.10.self_attn.q_proj
	- model.layers.17.self_attn.q_proj
	- model.layers.7.self_attn.q_proj
	- model.layers.5.self_attn.q_proj
	- model.layers.20.self_attn.q_proj
	- model.layers.3.self_attn.q_proj
	- model.layers.26.self_attn.q_proj
	- model.layers.27.self_attn.q_proj
	- model.layers.28.self_attn.q_proj
	- model.layers.33.self_attn.q_proj
	# self_attn.v_proj layers
	- model.layers.27.self_attn.v_proj
	- model.layers.20.self_attn.v_proj
	- model.layers.24.self_attn.v_proj
	- model.layers.25.self_attn.v_proj
	- model.layers.30.self_attn.v_proj
	- model.layers.2.self_attn.v_proj
	- model.layers.23.self_attn.v_proj
	- model.layers.22.self_attn.v_proj
	- model.layers.26.self_attn.v_proj
	- model.layers.33.self_attn.v_proj
	- model.layers.37.self_attn.v_proj
	- model.layers.7.self_attn.v_proj
	- model.layers.4.self_attn.v_proj
	- model.layers.18.self_attn.v_proj
	- model.layers.31.self_attn.v_proj
	- model.layers.17.self_attn.v_proj
	- model.layers.35.self_attn.v_proj
	- model.layers.32.self_attn.v_proj
	- model.layers.21.self_attn.v_proj
	- model.layers.3.self_attn.v_proj



	dataset_prepared_path: /workspace/axolotl/dolph-2.9.3-nemo-prepared
	val_set_size: 0.01
	output_dir: /workspace/axolotl/dolphin-2.9.3-mistral-nemo

	sequence_len: 8192
	sample_packing: true
	pad_to_sequence_len: true

	wandb_project: dolphin-2.9.3-Mistral-nemo
	wandb_watch:
	wandb_run_id:
	wandb_log_model:

	gradient_accumulation_steps: 16
	micro_batch_size: 1
	num_epochs: 3
	optimizer: adamw_torch
	lr_scheduler: cosine
	learning_rate: 5e-6
	train_on_inputs: false
	group_by_length: false
	bf16: auto
	fp16:
	tf32:

	gradient_checkpointing: true
	gradient_checkpointing_kwargs:
	  use_reentrant: false
	early_stopping_patience:
	resume_from_checkpoint:
	logging_steps: 1
	xformers_attention:
	flash_attention: true

	warmup_steps: 100
	# evals_per_epoch: 4
	eval_table_size:
	saves_per_epoch: 1
	save_total_limit: 2
	save_steps:
	debug:
	deepspeed: deepspeed_configs/zero3_bf16.json
	weight_decay: 0.1
	special_tokens:
	  eos_token: "<|im_end|>"
	  pad_token: "<pad>"
	  bos_token: "<s>"
	  unk_token: "<unk>"
	tokens:
	  - "<|im_start|>"


	# fsdp:
	#   - full_shard
	#   - auto_wrap
	# fsdp_config:
	#   fsdp_limit_all_gathers: true
	#   fsdp_sync_module_states: true
	#   fsdp_offload_params: true
	#   fsdp_use_orig_params: false
	#   fsdp_cpu_ram_efficient_loading: true
	#   fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
	#   fsdp_state_dict_type: FULL_STATE_DICT
	#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
	#   fsdp_sharding_strategy: FULL_SHARD
	#   fsdp_forward_prefetch: false
	#   fsdp_backward_prefetch: BACKWARD_PRE