Skip to content

Instantly share code, notes, and snippets.

@pacman100
Created June 1, 2023 22:26
Show Gist options
  • Save pacman100/1731b41f7a90a87b457e8c5415ff1c14 to your computer and use it in GitHub Desktop.
Save pacman100/1731b41f7a90a87b457e8c5415ff1c14 to your computer and use it in GitHub Desktop.
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from typing import Optional
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
AutoTokenizer,
TrainingArguments,
)
from peft.tuners.lora import LoraLayer
from trl import SFTTrainer
########################################################################
# This is a fully working simple example of how to use trl's SFTTrainer.
#
# This example fine-tunes any causal language model (GPT-2, GPT-Neo, etc.)
# using the SFTTrainer from trl. We leverage the PEFT library to fine-tune
# adapters on the model.
#
########################################################################
# Define and parse arguments.
@dataclass
class ScriptArguments:
    """
    Command-line options for the fine-tuning script.

    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size
    model you want to train. Each field becomes a ``--<field_name>`` flag when the class is handed to
    ``HfArgumentParser``; the ``help`` metadata is the flag's help text.
    """

    # --- Distributed / batching -------------------------------------------------
    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})
    per_device_train_batch_size: Optional[int] = field(default=4)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=4)

    # --- Optimisation -----------------------------------------------------------
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    # Annotated as float (not int): the parser derives the CLI type from the
    # annotation, and the default itself is fractional.
    weight_decay: Optional[float] = field(default=0.001)

    # --- LoRA adapter -----------------------------------------------------------
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)

    # --- Model / data -----------------------------------------------------------
    max_seq_length: Optional[int] = field(default=512)
    model_name: Optional[str] = field(
        default="tiiuae/falcon-7b",
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        },
    )
    dataset_name: Optional[str] = field(
        default="timdettmers/openassistant-guanaco",
        metadata={"help": "The preference dataset to use."},
    )

    # --- 4-bit quantization (bitsandbytes) --------------------------------------
    use_4bit: Optional[bool] = field(
        default=True,
        metadata={"help": "Activate 4bit precision base model loading"},
    )
    use_nested_quant: Optional[bool] = field(
        default=False,
        metadata={"help": "Activate nested quantization for 4bit base models"},
    )
    # Name of a ``torch`` dtype attribute (e.g. "float16", "bfloat16").
    bnb_4bit_compute_dtype: Optional[str] = field(
        default="float16",
        metadata={"help": "Compute dtype for 4bit base models"},
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4",
        metadata={"help": "Quantization type fp4 or nf4"},
    )

    # --- Training loop ----------------------------------------------------------
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs for the reward model."},
    )
    fp16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables bf16 training."},
    )
    packing: Optional[bool] = field(
        default=False,
        metadata={"help": "Use packing dataset creating."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=10000, metadata={"help": "How many optimizer update steps to take"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    group_by_length: bool = field(
        default=True,
        metadata={
            "help": "Group sequences into batches with same length. Saves memory and speeds up training considerably."
        },
    )

    # --- Checkpointing / logging ------------------------------------------------
    save_steps: int = field(default=10, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=10, metadata={"help": "Log every X updates steps."})
# Turn the command line into a populated ScriptArguments instance.
# parse_args_into_dataclasses() yields one object per dataclass handed to the
# parser; only ScriptArguments was registered, so the first entry is the one we want.
parser = HfArgumentParser(ScriptArguments)
parsed_dataclasses = parser.parse_args_into_dataclasses()
script_args = parsed_dataclasses[0]
def create_and_prepare_model(args):
    """Load the (optionally 4-bit quantized) base model, its LoRA config and tokenizer.

    Every setting is read from ``args`` rather than from module-level state, so
    the function can be called with any ``ScriptArguments``-like object.

    Args:
        args: Parsed script arguments. The attributes read are ``model_name``,
            ``use_4bit``, ``use_nested_quant``, ``bnb_4bit_compute_dtype``,
            ``bnb_4bit_quant_type``, ``lora_alpha``, ``lora_dropout`` and ``lora_r``.

    Returns:
        A ``(model, peft_config, tokenizer)`` tuple.

    Raises:
        AttributeError: If ``args.bnb_4bit_compute_dtype`` is not the name of an
            attribute of ``torch``.
    """
    # Resolve the dtype name (e.g. "float16") to the actual torch dtype object.
    compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=args.use_4bit,
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=args.use_nested_quant,
    )

    # Purely informational: hint at --bf16 when the GPU reports compute
    # capability major version >= 8.
    if compute_dtype == torch.float16 and args.use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print("=" * 80)

    # NOTE(review): {"": 0} presumably pins the whole model to device 0; confirm
    # against the accelerate device_map documentation before using multi-GPU.
    device_map = {"": 0}

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name, quantization_config=bnb_config, device_map=device_map, trust_remote_code=True
    )

    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        # Module names that receive LoRA adapters. These look specific to the
        # default Falcon architecture; other model families presumably need a
        # different list.
        target_modules=[
            "query_key_value",
            "dense",
            "dense_h_to_4h",
            "dense_4h_to_h",
        ],  # , "word_embeddings", "lm_head"],
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    return model, peft_config, tokenizer
# ---------------------------------------------------------------------------
# Training driver: build the trainer configuration, load model and data,
# adjust module dtypes, then train.
# ---------------------------------------------------------------------------

# Trainer settings, forwarded one-to-one from the parsed script arguments.
training_arguments = TrainingArguments(
    output_dir="./results",
    # batching
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    group_by_length=script_args.group_by_length,
    # optimisation
    optim=script_args.optim,
    learning_rate=script_args.learning_rate,
    lr_scheduler_type=script_args.lr_scheduler_type,
    warmup_ratio=script_args.warmup_ratio,
    max_grad_norm=script_args.max_grad_norm,
    max_steps=script_args.max_steps,
    # precision
    fp16=script_args.fp16,
    bf16=script_args.bf16,
    # checkpointing / logging
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
)

model, peft_config, tokenizer = create_and_prepare_model(script_args)
# Switch off the model's `use_cache` config flag before training.
model.config.use_cache = False

dataset = load_dataset(script_args.dataset_name, split="train")

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_seq_length=script_args.max_seq_length,
    packing=script_args.packing,
)

# Post-process module dtypes on the wrapped model. `Module.to` converts the
# module in place, so its return value does not need to be kept.
for module_name, submodule in trainer.model.named_modules():
    # LoRA adapter layers follow the bf16 flag.
    if isinstance(submodule, LoraLayer) and script_args.bf16:
        submodule.to(torch.bfloat16)

    # Anything with "norm" in its name is forced to float32.
    if "norm" in module_name:
        submodule.to(torch.float32)

    # Output head / token embeddings still in float32 are moved to bf16 when
    # bf16 training is requested.
    is_head_or_embedding = "lm_head" in module_name or "embed_tokens" in module_name
    if (
        is_head_or_embedding
        and hasattr(submodule, "weight")
        and script_args.bf16
        and submodule.weight.dtype == torch.float32
    ):
        submodule.to(torch.bfloat16)

trainer.train()
@alexanderfrey
Copy link

same here:
result = F.linear(x, transpose(self.weight, self.fan_in_fan_out), bias=self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (2048x4544 and 1x10614784)

@alexanderfrey
Copy link

OK, it's fixed. You have to upgrade peft to 0.4.0.dev0.

@alexanderfrey
Copy link

alexanderfrey commented Jul 14, 2023

Just curious - does anyone specify the eval_dataset for SFTTrainer ?

@anshoomehra
Copy link

@alexanderfrey

  1. Eval set: you should be able to run without one, or, just to test things out, you can carve out a certain % of the training set.

  2. Tried upgrading to peft 0.4.0.dev0 old error is gone but the new one surfaced :-)

RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the forward function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple checkpoint functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.
Parameter at index 255 has been marked as ready twice. This means that multiple autograd engine hooks have fired for this particular parameter during this iteration. You can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print parameter names for further debugging.

@anshoomehra
Copy link

Does the above code work with multiple GPUs?

You will need accelerate or deepspeed.

@xzrderek
Copy link

xzrderek commented Jul 14, 2023

I'm not familiar with accelerate, could you tell me more about how I would implement it? Also how can I use my own dataset (i.e. if it is a JSON file) with the code above?

@vejvarm
Copy link

vejvarm commented Jul 17, 2023

@okoliechykwuka

ValueError: Some specified arguments are not used by the HfArgumentParser: ['-f', '/root/.local/share/jupyter/runtime/kernel-a3d273a8-3e31-4ff4-b37c-9a51367dc60d.json']

I am having the above error.

I installed the required packages using this.

!pip install -q -U git+https://github.com/lvwerra/trl.git git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/accelerate.git git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

I think the problem is with jupyter adding the -f argument to sys.argv automatically when you run a cell.
You can fix it by passing args=[] to parser.parse_args_into_dataclasses, which will override the command line arguments.
i.e. try changing this line:
script_args = parser.parse_args_into_dataclasses(args=[])

@Einengutenmorgen
Copy link

How do I continue from a specific checkpoint to fine-tune with different data?
I tried to use the same script as for training,

def create_and_prepare_model():
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'eos_token': '[EOS]'})
model = AutoModelForCausalLM.from_pretrained( "tiiuae/falcon-7b", quantization_config=bnb_config, device_map="auto", trust_remote_code=True)


model = PeftModel.from_pretrained(model, <Path to adapter_config.json>)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value"
    ],
)
model.resize_token_embeddings(len(tokenizer))
return model, peft_config, tokenizer

ERROR:
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/peft/utils/config.py", line 177, in _get_peft_type
config_file = hf_hub_download(
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 110, in _inner_fn
validate_repo_id(arg_value)
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 158, in validate_repo_id
raise HFValidationError(
huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/home/ec2-user/SageMaker/Models/Model_10_multinews/checkpoint-13000/'. Use repo_type argument if needed.

@lamwilton
Copy link

After upgrading all packages I am getting this error also

File "/root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-7b/378337427557d1df3e742264a2901a49f25d4eb1/modelling_RW.py", line 93, in forward
return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
TypeError: unsupported operand type(s) for *: 'Tensor' and 'NoneType'

@xzrderek
Copy link

xzrderek commented Jul 20, 2023

My code runs but I am getting a spiky loss chart that looks like this:

image

Does anyone know why this is happening/what hyperparameters I should be tuning? I am running this command, mentioned by @pacman100:

python train.py \
--model_name tiiuae/falcon-40b \
--max_seq_len 2048 \
--bf16 \
--group_by_length \
--bnb_4bit_compute_dtype bfloat16 \
--max_steps 200

EDIT: seems like trl is causing this issue, forcing a pip install to trl==0.4.6 fixes it.

@mtisz
Copy link

mtisz commented Jul 25, 2023

Any of you running into issues with models generating too much text? Like, my models aren't stopping the generation, it doesn't know when to stop.. Does line 173 tokenizer.pad_token = tokenizer.eos_token actually help? Or do I need to append the eos_token at the end of each training line?

{"text": "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nUSER: Read the following paragraph and extract the answer for the question: What is the last name of the person who composed seven operas in three years at Brunswick? Handel joined the Hamburg opera house when it was experiencing a period of considerable artistic success. This blossoming followed the arrival of Reinhard Keiser, who had become musical director at the G\u00e4nsemarkt in about 1697, and in 1703 succeeded Johann Kusser as the theatre's manager. Born in 1674, Keiser had studied under Johann Schelle and probably Johann Kuhnau at the Thomasschule zu Leipzig. In 1694 he was employed as a court composer at Brunswick, where in three years he composed seven operas, at least one of which (Mahumeth) was performed in Hamburg. According to Handel's biographer Donald Burrows, Keiser was a good judge of popular taste, with a flair for writing Italian-style arias. Between 1697 and 1703, prior to Handel's arrival, about a dozen more Keiser operas had been staged at the G\u00e4nsemarkt. Despite his on-stage successes, Keiser was an unreliable general manager, with expensive private tastes and little financial acumen, often at odds with his creditors.It is possible that Keiser, who had connections in the Halle area, had heard of Handel and was directly instrumental in securing the latter's post in the G\u00e4nsemarkt orchestra; certainly he was a considerable influence on the younger man in the three years that Handel spent in Hamburg. Another important G\u00e4nsemarkt colleague was the house composer and singer Johann Mattheson, who noted Handel's rapid progress in the orchestra from back-desk violinist to harpsichord soloist, a role in which, said Mattheson, \"he showed himself a man\u2014a thing which no one had before suspected, save I alone\". 
Mattheson was less complimentary on Handel's early efforts at composition: \"He composed very long, long arias, and really interminable cantatas\", before, it seems, \"the lofty schooling of opera ... trimmed him into other fashions\":\nASSISTANT: Keiser\n"}

Do I need to append this with </s> (for LLaMA)? Or do I need to append the line with "USER:" so the model adds that term at the end of each generated response, so I can strip the text before it?

@xzrderek
Copy link

does anyone know how can I load model from checkpoint to continue training? I've trained it with my dataset, now I want to pick it up from the checkpoint and continue the training but using the guanaco dataset on top. Is it possible?

@rogeriochaves did you ever figure out if this was possible?

@DanM3rcurius
Copy link

GM
I'm running into memory issues when training. Not sure if this is due to my graphics card not being powerful enough?
if someone could point a noob like me to what I'm doing wrong, that would be much appreciated.

OutOfMemoryError: CUDA out of memory. Tried to allocate 72.00 MiB. GPU 0 has a total capacty of 5.80 GiB of which 47.69 MiB is free. Including non-PyTorch memory, this process has 5.74 GiB memory in use. Of the allocated memory 5.55 GiB is allocated by PyTorch, and 54.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 3060 ... Off | 00000000:01:00.0 Off | N/A |
| N/A 41C P8 10W / 80W | 5890MiB / 6144MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 1348 G /usr/lib/Xorg 4MiB |
| 0 N/A N/A 3446 C /home/mercurius/LLMs/llmenv/bin/python 5876MiB |
+---------------------------------------------------------------------------------------+

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment