# coding=utf-8 | |
# Copyright 2023 The HuggingFace Inc. team. All rights reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
from dataclasses import dataclass, field | |
from typing import Optional | |
import torch | |
from datasets import load_dataset | |
from peft import LoraConfig | |
from transformers import ( | |
AutoModelForCausalLM, | |
AutoTokenizer, | |
BitsAndBytesConfig, | |
HfArgumentParser, | |
AutoTokenizer, | |
TrainingArguments, | |
) | |
from peft.tuners.lora import LoraLayer | |
from trl import SFTTrainer | |
######################################################################## | |
# This is a fully working simple example to use trl's SFTTrainer.
#
# This example fine-tunes a causal language model (GPT-2, GPT-Neo, etc.)
# using the SFTTrainer from trl; we leverage the PEFT library to fine-tune
# adapters on the model.
# | |
######################################################################## | |
# Define and parse arguments. | |
@dataclass
class ScriptArguments:
    """
    Command-line options for the fine-tuning script.

    The class is handed to ``HfArgumentParser``, so every field below is
    exposed as a ``--<field_name>`` flag. These arguments vary depending on
    how many GPUs you have, what their capacity and features are, and what
    size model you want to train.
    """

    # --- Batch / optimizer hyper-parameters ---------------------------------
    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})
    per_device_train_batch_size: Optional[int] = field(default=4)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    # Annotated as float (was int): the default is fractional, and the
    # annotation decides how the command-line value is converted.
    weight_decay: Optional[float] = field(default=0.001)

    # --- LoRA adapter hyper-parameters --------------------------------------
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)

    # --- Model / data selection ---------------------------------------------
    max_seq_length: Optional[int] = field(default=512)
    model_name: Optional[str] = field(
        default="tiiuae/falcon-7b",
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        },
    )
    dataset_name: Optional[str] = field(
        default="timdettmers/openassistant-guanaco",
        metadata={"help": "The preference dataset to use."},
    )

    # --- bitsandbytes quantization ------------------------------------------
    use_4bit: Optional[bool] = field(
        default=True,
        metadata={"help": "Activate 4bit precision base model loading"},
    )
    use_nested_quant: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate nested quantization for 4bit base models"},
    )
    # Name of a ``torch`` dtype attribute; resolved with ``getattr(torch, ...)``
    # by ``create_and_prepare_model``.
    bnb_4bit_compute_dtype: Optional[str] = field(
        default="float16",
        metadata={"help": "Compute dtype for 4bit base models"},
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4",
        metadata={"help": "Quantization type fp4 or nf4"},
    )

    # --- Training loop ------------------------------------------------------
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs for the reward model."},
    )
    fp16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables bf16 training."},
    )
    packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Use packing dataset creating."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=10000, metadata={"help": "How many optimizer update steps to take"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    group_by_length: bool = field(
        default=True,
        metadata={
            "help": "Group sequences into batches with same length. Saves memory and speeds up training considerably."
        },
    )

    # --- Checkpointing / logging cadence ------------------------------------
    save_steps: int = field(default=10, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=10, metadata={"help": "Log every X updates steps."})
# Build a command-line parser from the ScriptArguments fields and parse the
# process arguments with it.
parser = HfArgumentParser(ScriptArguments)
parsed_dataclasses = parser.parse_args_into_dataclasses()
# Only one dataclass type was registered, so the first entry is the
# ScriptArguments instance.
script_args = parsed_dataclasses[0]
def create_and_prepare_model(args):
    """
    Load the base model, build the LoRA configuration and load the tokenizer.

    Every setting is read from ``args``; the function no longer reaches for
    the module-level ``script_args``, so the model and the tokenizer are
    always taken from the same configuration object.

    Args:
        args: Parsed script arguments. The attributes read are
            ``bnb_4bit_compute_dtype``, ``use_4bit``, ``bnb_4bit_quant_type``,
            ``use_nested_quant``, ``model_name``, ``lora_alpha``,
            ``lora_dropout`` and ``lora_r``.

    Returns:
        A ``(model, peft_config, tokenizer)`` tuple.
    """
    # The dtype arrives as a string (e.g. "float16"); resolve it to the torch dtype object.
    compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=args.use_4bit,
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=args.use_nested_quant,
    )

    # Purely informational: hint at --bf16 when the GPU reports compute capability 8.x or newer.
    if compute_dtype == torch.float16 and args.use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    # The empty-string key addresses the whole model, so everything is placed on device 0.
    device_map = {"": 0}

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name, quantization_config=bnb_config, device_map=device_map, trust_remote_code=True
    )

    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        # Names of the linear sub-modules that receive LoRA adapters.
        target_modules=[
            "query_key_value",
            "dense",
            "dense_h_to_4h",
            "dense_4h_to_h",
        ],  # , "word_embeddings", "lm_head"],
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # Reuse the end-of-sequence token for padding.
    # NOTE(review): presumably the tokenizer ships without a pad token - confirm.
    tokenizer.pad_token = tokenizer.eos_token

    return model, peft_config, tokenizer
# Translate the parsed script arguments into the Trainer configuration.
# weight_decay, num_train_epochs and per_device_eval_batch_size are declared
# on ScriptArguments and accepted on the command line; they are forwarded here
# so that passing those flags actually has an effect.
# NOTE(review): gradient_checkpointing and local_rank are still not forwarded;
# wiring gradient_checkpointing changes memory/runtime behaviour, so confirm
# the intended setup before enabling it here.
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    weight_decay=script_args.weight_decay,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
    max_grad_norm=script_args.max_grad_norm,
    # A positive max_steps takes precedence over num_train_epochs in the
    # Trainer; pass --max_steps -1 to train by epochs instead.
    num_train_epochs=script_args.num_train_epochs,
    max_steps=script_args.max_steps,
    warmup_ratio=script_args.warmup_ratio,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
)
# Load the base model, the LoRA configuration and the tokenizer.
model, peft_config, tokenizer = create_and_prepare_model(script_args)
# Turn off the key/value cache for training.
model.config.use_cache = False

dataset = load_dataset(script_args.dataset_name, split="train")

# The trainer reads the "text" column of the dataset and applies peft_config
# to the model.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_seq_length=script_args.max_seq_length,
    packing=script_args.packing,
)

# Post-process module dtypes. The three checks are independent of each other
# and run for every module. ``Module.to`` converts the module in place, so its
# return value does not need to be captured.
for name, module in trainer.model.named_modules():
    # LoRA layers follow the bf16 flag.
    if script_args.bf16 and isinstance(module, LoraLayer):
        module.to(torch.bfloat16)

    # Anything with "norm" in its name is kept in float32.
    if "norm" in name:
        module.to(torch.float32)

    # Output head / token embeddings: cast float32 weights to bf16 when bf16 is on.
    # NOTE(review): Falcon appears to name its embedding "word_embeddings", which
    # this name check would not match - confirm whether that is intended.
    is_head_or_embedding = "lm_head" in name or "embed_tokens" in name
    if is_head_or_embedding and hasattr(module, "weight"):
        if script_args.bf16 and module.weight.dtype == torch.float32:
            module.to(torch.bfloat16)

trainer.train()
@imrankh46 what GPU?
@imrankh46 what GPU?
Colab free GPU
ValueError: Some specified arguments are not used by the HfArgumentParser: ['-f', '/root/.local/share/jupyter/runtime/kernel-a3d273a8-3e31-4ff4-b37c-9a51367dc60d.json']
I am having the above error.
I installed the required packages using this.
!pip install -q -U git+https://github.com/lvwerra/trl.git git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/accelerate.git git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb
out of curiosity, has anyone benchmarked or has a gut feel on the performance of mosaic-instruct 7b / 30b vs falcon-instruct 7b / 40b for any tasks? which is better?
@amnasher are you running on cpu or gpu? do you call model.to("cuda") or pass in a device anywhere? my guess is you're running on CPU :)
Yes I do assign DEVICE = "cuda:0" ? How is it running on CPU? I am using colab free.
@anshoomehra btw - I'm getting OOM while trying to run inference on an 80GB gpu with falcon-40B-instruct (which is args.model_name) based on your code snippet. below is an example, which still is OOM without the PeftModel() call and/or without passing a config into the original model loading. any idea or did i miss something in the code snippet or something? many thanks. i assume the PeftModel() call is totally unnecessary btw - i tried without it as well and still OOM.
print(f"loading the bits and btyes config + PEFT model for {args.model_name} and is_base={args.is_base}")

# turn into peft model with adapters
# 4-bit NF4 quantization with nested quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model.
model = model_cls.from_pretrained(
    args.model_name,
    config=config,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

# Attach the adapter weights stored in the checkpoint directory.
model = PeftModel.from_pretrained(model, args.ckpt_dir)
@nshalon hmm, that's weird you should be able to load it in approx ~40G. I am attaching the full script here & a memory profile screenshot. This should help you enable Peft as well, in the last one I rush-sent a simpler version that did not have all Peft-related code. I do not see Peft reducing memory footprint beyond 4-bits BnB however the trainable parameters would come down further by certain %.
This script may help others too.
Would you mind sharing your training script? Not sure, if you sent it earlier and I missed it? (a lot of messages since then)
**With or without PEFT, ~40G profile:**
Code:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
Trainer,
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
AutoTokenizer,
TrainingArguments,
)
def create_and_prepare_model(model_name="tiiuae/falcon-40b-instruct", enablePEFT=False):
    """
    Load a 4-bit quantized causal LM and its tokenizer, optionally wrapped with LoRA.

    Args:
        model_name: Hub identifier of the model and tokenizer to load.
        enablePEFT: When true, the model is wrapped with ``get_peft_model``
            using the LoRA configuration built below.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # 4-bit NF4 quantization with nested quantization and bfloat16 compute.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        trust_remote_code=True,
        quantization_config=quantization,
    )

    # Built unconditionally, exactly as before, even if PEFT ends up disabled.
    lora_targets = [
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]  # , "word_embeddings", "lm_head"],
    lora_settings = LoraConfig(
        r=64,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=lora_targets,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    model.gradient_checkpointing_enable()

    if not enablePEFT:
        return model, tokenizer
    return get_peft_model(model, lora_settings), tokenizer
# Load the 40B instruct model with the LoRA wrapper enabled.
_m, _t = create_and_prepare_model(model_name="tiiuae/falcon-40b-instruct", enablePEFT=True)
Hello guys, have you tried using W&B checkpoints? They are hard to load, there is no proper guidance on the site, and gc flushes all the data after the session is closed. I think it is easier with TensorBoard: you just load the latest checkpoint, or whichever checkpoint you are going to use for inference.
checkout this video on 40b instruct by sentdex on youtube if you are interested in finding out more about the power of 40b!!
https://www.youtube.com/watch?v=-IV1NTGy6Mg
@anshoomehra thanks! that worked... BTW bitsandbytes has some compatibility issues with H100 GPUs, curious if there's another way to load the 40B Falcon on an 80GB GPU...
I'm hesitant to put the training script on public forum since i'm working on some proprietary stuff, but if you email me I can send it to you directly (nitan@shalon.com).
But here's the DeepSpeed ZeRO config that I'm using in case that's helpful.
{
"bf16": {
"enabled": true
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"zero_optimization": {
"stage": 3,
"contiguous_gradients": true,
"overlap_comm": true,
"stage3_gather_16bit_weights_on_model_save": true,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9
},
"gradient_accumulation_steps": 344,
"gradient_clipping": "auto",
"steps_per_print": 5,
"train_micro_batch_size_per_gpu": 1,
"train_batch_size": 1032,
"wall_clock_breakdown": false,
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps": "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"activation_checkpointing": {
"partition_activations": false,
"cpu_checkpointing": false,
"contiguous_memory_optimization": false,
"number_checkpoints": null,
"synchronize_checkpoint_boundary": false,
"profile": false
}
}
has anyone been able to have this script run at all on something not in colab?
I keep getting a matrix multiplication shape mismatch, as detailed in huggingface/peft#685
Error
===================================BUG REPORT===================================
Welcome to bitsandbytes. For bug reports, please run
python -m bitsandbytes
and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
================================================================================
bin /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
mode='disabled'
run=
report_to='none'
{'report_to': 'none', 'path2config': '/lfs/ampere1/0/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/wandb_uu/sweep_configs/debug_config.yaml', 'program': '~/ultimate-utils/ultimate-utils-proj-src/uutils/wandb_uu/sweeps_common.py', 'project': 'playground', 'entity': 'brando', 'name': 'debug-logging-to-wandb-plataform-test', 'description': 'debug-not-logging-to-wandb-plataform-test', 'metric': {'name': 'train_loss', 'goal': 'minimize'}, 'method': 'random', 'optimizer': 'nadam', 'scheduler': 'cosine', 'lr': 0.0001, 'batch_size': 32, 'num_its': 2, 'run_cap': 1}
Found cached dataset json (/lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Found cached dataset json (/lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:15<00:00, 2.00s/it]
Checking for dtype of HF model.
--> Weight 'transformer.word_embeddings.weight' has datatype: torch.float16
Loading cached processed dataset at /lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-3631bcc54f6b2e69.arrow
0%| | 0/500 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Traceback (most recent call last):
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/pdb.py", line 1723, in main
pdb._runscript(mainpyfile)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/pdb.py", line 1583, in _runscript
self.run(statement)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/bdb.py", line 598, in run
exec(cmd, globals, locals)
File "<string>", line 1, in <module>
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/mains_hf/falcon_uu/main_falcon_uu.py", line 34, in <module>
main_falcon()
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/mains_hf/falcon_uu/main_falcon_uu.py", line 21, in main_falcon
train(args)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/train/sft/qlora_ft.py", line 59, in train_falcon
trainer.train()
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 1645, in train
return inner_training_loop(
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 2759, in training_step
loss = self.compute_loss(model, inputs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 2784, in compute_loss
outputs = model(**inputs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/utils/operations.py", line 553, in forward
return model_forward(*args, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/utils/operations.py", line 541, in __call__
return convert_to_fp32(self.model_forward(*args, **kwargs))
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast
return func(*args, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/peft/peft_model.py", line 678, in forward
return self.base_model(
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 753, in forward
transformer_outputs = self.transformer(
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 648, in forward
outputs = block(
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 385, in forward
attn_outputs = self.self_attention(
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 242, in forward
fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/peft/tuners/lora.py", line 565, in forward
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2048x4544 and 1x10614784)
system info
python collect_env.py
Collecting environment information...
PyTorch version: 2.0.1
Is debug build: False
CUDA used to build PyTorch: 11.7
ROCM used to build PyTorch: N/A
OS: Ubuntu 20.04.4 LTS (x86_64)
GCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0
Clang version: Could not collect
CMake version: version 3.26.4
Libc version: glibc-2.31
Python version: 3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0] (64-bit runtime)
Python platform: Linux-5.4.0-122-generic-x86_64-with-glibc2.31
Is CUDA available: True
CUDA runtime version: 11.7.64
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration:
GPU 0: NVIDIA A100-SXM4-80GB
GPU 1: NVIDIA A100-SXM4-80GB
GPU 2: NVIDIA A100-SXM4-80GB
GPU 3: NVIDIA A100-SXM4-80GB
GPU 4: NVIDIA A100-SXM4-80GB
GPU 5: NVIDIA A100-SXM4-80GB
GPU 6: NVIDIA A100-SXM4-80GB
GPU 7: NVIDIA A100-SXM4-80GB
Nvidia driver version: 515.43.04
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
CPU:
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 48 bits physical, 48 bits virtual
CPU(s): 128
On-line CPU(s) list: 0-127
Thread(s) per core: 2
Core(s) per socket: 32
Socket(s): 2
NUMA node(s): 2
Vendor ID: AuthenticAMD
CPU family: 25
Model: 1
Model name: AMD EPYC 7543 32-Core Processor
Stepping: 1
Frequency boost: enabled
CPU MHz: 3455.484
CPU max MHz: 2800.0000
CPU min MHz: 1500.0000
BogoMIPS: 5599.81
Virtualization: AMD-V
L1d cache: 2 MiB
L1i cache: 2 MiB
L2 cache: 32 MiB
L3 cache: 512 MiB
NUMA node0 CPU(s): 0-31,64-95
NUMA node1 CPU(s): 32-63,96-127
Vulnerability Itlb multihit: Not affected
Vulnerability L1tf: Not affected
Vulnerability Mds: Not affected
Vulnerability Meltdown: Not affected
Vulnerability Mmio stale data: Not affected
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling
Vulnerability Srbds: Not affected
Vulnerability Tsx async abort: Not affected
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca
Versions of relevant libraries:
[pip3] numpy==1.25.0
[pip3] torch==2.0.1
[pip3] torchaudio==2.0.2
[pip3] torchvision==0.15.2
[pip3] triton==2.0.0
[conda] blas 1.0 mkl
[conda] ffmpeg 4.3 hf484d3e_0 pytorch
[conda] mkl 2023.1.0 h6d00ec8_46342
[conda] mkl-service 2.4.0 py310h5eee18b_1
[conda] mkl_fft 1.3.6 py310h1128e8f_1
[conda] mkl_random 1.2.2 py310h1128e8f_1
[conda] numpy 1.25.1 pypi_0 pypi
[conda] numpy-base 1.25.0 py310hb5e798b_0
[conda] pytorch 2.0.1 py3.10_cuda11.7_cudnn8.5.0_0 pytorch
[conda] pytorch-cuda 11.7 h778d358_5 pytorch
[conda] pytorch-mutex 1.0 cuda pytorch
[conda] torchaudio 2.0.2 py310_cu117 pytorch
[conda] torchtriton 2.0.0 py310 pytorch
[conda] torchvision 0.15.2 py310_cu117 pytorch
[conda] triton 2.0.0 pypi_0 pypi
@nshalon sorry I don't think I follow.
Has anyone been able to run this script at all outside of Colab?
I keep getting a matrix multiplication shape mismatch, as detailed in huggingface/peft#685
Error
===================================BUG REPORT=================================== Welcome to bitsandbytes. For bug reports, please run python -m bitsandbytes and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues ================================================================================ bin /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so CUDA SETUP: CUDA runtime path found: /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/libcudart.so.11.0 CUDA SETUP: Highest compute capability among GPUs detected: 8.0 CUDA SETUP: Detected CUDA version 117 CUDA SETUP: Loading binary /lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so... mode='disabled' run= report_to='none' {'report_to': 'none', 'path2config': '/lfs/ampere1/0/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/wandb_uu/sweep_configs/debug_config.yaml', 'program': '~/ultimate-utils/ultimate-utils-proj-src/uutils/wandb_uu/sweeps_common.py', 'project': 'playground', 'entity': 'brando', 'name': 'debug-logging-to-wandb-plataform-test', 'description': 'debug-not-logging-to-wandb-plataform-test', 'metric': {'name': 'train_loss', 'goal': 'minimize'}, 'method': 'random', 'optimizer': 'nadam', 'scheduler': 'cosine', 'lr': 0.0001, 'batch_size': 32, 'num_its': 2, 'run_cap': 1} Found cached dataset json (/lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96) Found cached dataset json (/lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96) Loading checkpoint shards: 
100%|██████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:15<00:00, 2.00s/it] Checking for dtype of HF model. --> Weight 'transformer.word_embeddings.weight' has datatype: torch.float16 Loading cached processed dataset at /lfs/ampere1/0/brando9/.cache/huggingface/datasets/timdettmers___json/timdettmers--openassistant-guanaco-6126c710748182cf/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-3631bcc54f6b2e69.arrow 0%| | 0/500 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. Traceback (most recent call last): File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/pdb.py", line 1723, in main pdb._runscript(mainpyfile) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/pdb.py", line 1583, in _runscript self.run(statement) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/bdb.py", line 598, in run exec(cmd, globals, locals) File "<string>", line 1, in <module> File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/mains_hf/falcon_uu/main_falcon_uu.py", line 34, in <module> main_falcon() File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/mains_hf/falcon_uu/main_falcon_uu.py", line 21, in main_falcon train(args) File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/hf_uu/train/sft/qlora_ft.py", line 59, in train_falcon trainer.train() File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 1645, in train return inner_training_loop( File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop 
tr_loss_step = self.training_step(model, inputs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 2759, in training_step loss = self.compute_loss(model, inputs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/transformers/trainer.py", line 2784, in compute_loss outputs = model(**inputs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/utils/operations.py", line 553, in forward return model_forward(*args, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/utils/operations.py", line 541, in __call__ return convert_to_fp32(self.model_forward(*args, **kwargs)) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 14, in decorate_autocast return func(*args, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/peft/peft_model.py", line 678, in forward return self.base_model( File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward output = old_forward(*args, **kwargs) File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 753, in forward transformer_outputs = self.transformer( File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return 
forward_call(*args, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward output = old_forward(*args, **kwargs) File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 648, in forward outputs = block( File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward output = old_forward(*args, **kwargs) File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 385, in forward attn_outputs = self.self_attention( File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward output = old_forward(*args, **kwargs) File "/lfs/ampere1/0/brando9/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 242, in forward fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl return forward_call(*args, **kwargs) File "/lfs/ampere1/0/brando9/miniconda/envs/data_quality/lib/python3.10/site-packages/peft/tuners/lora.py", line 565, in forward result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias) RuntimeError: mat1 and mat2 shapes cannot be 
multiplied (2048x4544 and 1x10614784)
system info
python collect_env.py Collecting environment information... PyTorch version: 2.0.1 Is debug build: False CUDA used to build PyTorch: 11.7 ROCM used to build PyTorch: N/A OS: Ubuntu 20.04.4 LTS (x86_64) GCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0 Clang version: Could not collect CMake version: version 3.26.4 Libc version: glibc-2.31 Python version: 3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0] (64-bit runtime) Python platform: Linux-5.4.0-122-generic-x86_64-with-glibc2.31 Is CUDA available: True CUDA runtime version: 11.7.64 CUDA_MODULE_LOADING set to: LAZY GPU models and configuration: GPU 0: NVIDIA A100-SXM4-80GB GPU 1: NVIDIA A100-SXM4-80GB GPU 2: NVIDIA A100-SXM4-80GB GPU 3: NVIDIA A100-SXM4-80GB GPU 4: NVIDIA A100-SXM4-80GB GPU 5: NVIDIA A100-SXM4-80GB GPU 6: NVIDIA A100-SXM4-80GB GPU 7: NVIDIA A100-SXM4-80GB Nvidia driver version: 515.43.04 cuDNN version: Could not collect HIP runtime version: N/A MIOpen runtime version: N/A Is XNNPACK available: True CPU: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 48 bits physical, 48 bits virtual CPU(s): 128 On-line CPU(s) list: 0-127 Thread(s) per core: 2 Core(s) per socket: 32 Socket(s): 2 NUMA node(s): 2 Vendor ID: AuthenticAMD CPU family: 25 Model: 1 Model name: AMD EPYC 7543 32-Core Processor Stepping: 1 Frequency boost: enabled CPU MHz: 3455.484 CPU max MHz: 2800.0000 CPU min MHz: 1500.0000 BogoMIPS: 5599.81 Virtualization: AMD-V L1d cache: 2 MiB L1i cache: 2 MiB L2 cache: 32 MiB L3 cache: 512 MiB NUMA node0 CPU(s): 0-31,64-95 NUMA node1 CPU(s): 32-63,96-127 Vulnerability Itlb multihit: Not affected Vulnerability L1tf: Not affected Vulnerability Mds: Not affected Vulnerability Meltdown: Not affected Vulnerability Mmio stale data: Not affected Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre 
v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP always-on, RSB filling Vulnerability Srbds: Not affected Vulnerability Tsx async abort: Not affected Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca Versions of relevant libraries: [pip3] numpy==1.25.0 [pip3] torch==2.0.1 [pip3] torchaudio==2.0.2 [pip3] torchvision==0.15.2 [pip3] triton==2.0.0 [conda] blas 1.0 mkl [conda] ffmpeg 4.3 hf484d3e_0 pytorch [conda] mkl 2023.1.0 h6d00ec8_46342 [conda] mkl-service 2.4.0 py310h5eee18b_1 [conda] mkl_fft 1.3.6 py310h1128e8f_1 [conda] mkl_random 1.2.2 py310h1128e8f_1 [conda] numpy 1.25.1 pypi_0 pypi [conda] numpy-base 1.25.0 py310hb5e798b_0 [conda] pytorch 2.0.1 py3.10_cuda11.7_cudnn8.5.0_0 pytorch [conda] pytorch-cuda 11.7 h778d358_5 pytorch [conda] pytorch-mutex 1.0 cuda pytorch [conda] torchaudio 2.0.2 py310_cu117 pytorch [conda] torchtriton 2.0.0 py310 pytorch [conda] torchvision 0.15.2 py310_cu117 pytorch [conda] triton 2.0.0 pypi_0 pypi
Same here!
Traceback (most recent call last):
File "falcon_peft.py", line 222, in <module>
trainer.train()
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/transformers/trainer.py", line 1645, in train
return inner_training_loop(
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/transformers/trainer.py", line 2759, in training_step
loss = self.compute_loss(model, inputs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/transformers/trainer.py", line 2784, in compute_loss
outputs = model(**inputs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
output.reraise()
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/_utils.py", line 644, in reraise
raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
output = module(*input, **kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/peft/peft_model.py", line 678, in forward
return self.base_model(
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 753, in forward
transformer_outputs = self.transformer(
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 648, in forward
outputs = block(
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 385, in forward
attn_outputs = self.self_attention(
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/2f5c3cd4eace6be6c0f12981f377fb35e5bf6ee5/modelling_RW.py", line 242, in forward
fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/entity-identification/.venv/lib/python3.8/site-packages/peft/tuners/lora.py", line 565, in forward
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2048x4544 and 1x10614784)
@anshoomehra I will share when I get to it this weekend. How did you get the 40B (base or instruct) to do inference with 36GB? I'm seeing OOM with 80GB and standard eval-mode loading. Thanks.
Set
load_in_8bit=True
This didn't work for me while running eval. any other suggestions?
Same weird error, even with the base script and their dataset. Does anyone have a solution to this?
RuntimeError: mat1 and mat2 shapes cannot be multiplied
same here:
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2048x4544 and 1x10614784)
OK, it's fixed. You have to upgrade peft to 0.4.0.dev0.
Just curious - does anyone specify the eval_dataset for SFTTrainer ?
-
Regarding the eval set: you should be able to run without one, or, just to test things out, you can carve out a certain percentage of the training set.
-
I tried upgrading to peft 0.4.0.dev0: the old error is gone, but a new one surfaced :-)
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the forward
function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple checkpoint
functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.
Parameter at index 255 has been marked as ready twice. This means that multiple autograd engine hooks have fired for this particular parameter during this iteration. You can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print parameter names for further debugging.
Does the above code work with multiple GPUs?
You will need accelerate or deepspeed.
I'm not familiar with accelerate, could you tell me more about how I would implement it? Also how can I use my own dataset (i.e. if it is a JSON file) with the code above?
ValueError: Some specified arguments are not used by the HfArgumentParser: ['-f', '/root/.local/share/jupyter/runtime/kernel-a3d273a8-3e31-4ff4-b37c-9a51367dc60d.json']
I am having the above error.
I installed the required packages using this.
!pip install -q -U git+https://github.com/lvwerra/trl.git git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/accelerate.git git+https://github.com/huggingface/peft.git !pip install -q datasets bitsandbytes einops wandb
I think the problem is with jupyter adding the -f
argument to sys.argv automatically when you run a cell.
You can fix it by passing args=[]
to the parser.parse_args_into_dataclasses
, which will override the command line arguments.
i.e. try changing this line:
script_args = parser.parse_args_into_dataclasses(args=[])
How do I continue from a specific checkpoint to fine-tune with different data?
I tried to use the same script as for training,
def create_and_prepare_model():
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=compute_dtype,
bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'eos_token': '[EOS]'})
model = AutoModelForCausalLM.from_pretrained( "tiiuae/falcon-7b", quantization_config=bnb_config, device_map="auto", trust_remote_code=True)
model = PeftModel.from_pretrained(model, <Path to adapter_config.json>)
peft_config = LoraConfig(
lora_alpha=16,
lora_dropout=0.1,
r=64,
bias="none",
task_type="CAUSAL_LM",
target_modules=[
"query_key_value"
],
)
model.resize_token_embeddings(len(tokenizer))
return model, peft_config, tokenizer
ERROR:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/peft/utils/config.py", line 177, in _get_peft_type
config_file = hf_hub_download(
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 110, in _inner_fn
validate_repo_id(arg_value)
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 158, in validate_repo_id
raise HFValidationError(
huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/ec2-user/SageMaker/Models/Model_10_multinews/checkpoint-13000/'. Use repo_type
argument if needed.
After upgrading all packages I am getting this error also
File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/378337427557d1df3e742264a2901a49f25d4eb1/modelling_RW.py", line 93, in forward
return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
TypeError: unsupported operand type(s) for *: 'Tensor' and 'NoneType'
My code runs but I am getting a spiky loss chart that looks like this:
Does anyone know why this is happening/what hyperparameters I should be tuning? I am running this command, mentioned by @pacman100:
python train.py \
--model_name tiiuae/falcon-40b \
--max_seq_len 2048 \
--bf16 \
--group_by_length \
--bnb_4bit_compute_dtype bfloat16 \
--max_steps 200
EDIT: seems like trl is causing this issue, forcing a pip install to trl==0.4.6
fixes it.
Any of you running into issues with models generating too much text? Like, my models aren't stopping the generation, it doesn't know when to stop.. Does line 173 tokenizer.pad_token = tokenizer.eos_token
actually help? Or do I need to append the eos_token at the end of each training line?
{"text": "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nUSER: Read the following paragraph and extract the answer for the question: What is the last name of the person who composed seven operas in three years at Brunswick? Handel joined the Hamburg opera house when it was experiencing a period of considerable artistic success. This blossoming followed the arrival of Reinhard Keiser, who had become musical director at the G\u00e4nsemarkt in about 1697, and in 1703 succeeded Johann Kusser as the theatre's manager. Born in 1674, Keiser had studied under Johann Schelle and probably Johann Kuhnau at the Thomasschule zu Leipzig. In 1694 he was employed as a court composer at Brunswick, where in three years he composed seven operas, at least one of which (Mahumeth) was performed in Hamburg. According to Handel's biographer Donald Burrows, Keiser was a good judge of popular taste, with a flair for writing Italian-style arias. Between 1697 and 1703, prior to Handel's arrival, about a dozen more Keiser operas had been staged at the G\u00e4nsemarkt. Despite his on-stage successes, Keiser was an unreliable general manager, with expensive private tastes and little financial acumen, often at odds with his creditors.It is possible that Keiser, who had connections in the Halle area, had heard of Handel and was directly instrumental in securing the latter's post in the G\u00e4nsemarkt orchestra; certainly he was a considerable influence on the younger man in the three years that Handel spent in Hamburg. Another important G\u00e4nsemarkt colleague was the house composer and singer Johann Mattheson, who noted Handel's rapid progress in the orchestra from back-desk violinist to harpsichord soloist, a role in which, said Mattheson, \"he showed himself a man\u2014a thing which no one had before suspected, save I alone\". 
Mattheson was less complimentary on Handel's early efforts at composition: \"He composed very long, long arias, and really interminable cantatas\", before, it seems, \"the lofty schooling of opera ... trimmed him into other fashions\":\nASSISTANT: Keiser\n"}
Do I need to append this with </s>
(for LLaMA)? Or do I need to append the line with "USER:"
so the model adds that term at the end of each generated response, so I can strip the text before it?
Does anyone know how I can load the model from a checkpoint to continue training? I've trained it with my dataset; now I want to pick it up from the checkpoint and continue training, but using the guanaco dataset on top. Is it possible?
@rogeriochaves did you ever figure out if this was possible?
GM
I'm running into memory issues when training. Not sure if this is due to my graphics card not being powerful enough?
if someone could point a noob like me to what I'm doing wrong, that would be much appreciated.
OutOfMemoryError: CUDA out of memory. Tried to allocate 72.00 MiB. GPU 0 has a total capacty of 5.80 GiB of which 47.69 MiB is free. Including non-PyTorch memory, this process has 5.74 GiB memory in use. Of the allocated memory 5.55 GiB is allocated by PyTorch, and 54.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 41C P8 10W / 80W | 5890MiB / 6144MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1348 G /usr/lib/Xorg 4MiB |
| 0 N/A N/A 3446 C /home/mercurius/LLMs/llmenv/bin/python 5876MiB |
+---------------------------------------------------------------------------------------+
I am using the following code.