@brando90
Created June 5, 2024 00:17
loading_json_data.py
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load dataset from JSON files
data_files = {"train": "path/to/your/train.json", "test": "path/to/your/test.json"}
dataset = load_dataset("json", data_files=data_files)

# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# GPT-2 has no padding token by default; reuse the EOS token so padding works
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data collator for causal language modeling (mlm=False disables masked-LM objectives)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

# Run fine-tuning
trainer.train()
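For reference, load_dataset("json", ...) combined with the tokenize_function above assumes each record exposes a "text" field. A minimal sketch of producing such files in JSON Lines form (the file names and sample strings here are illustrative, not part of the original gist):

import json

# Hypothetical sample records; the only assumed requirement is a "text" field,
# because tokenize_function reads examples["text"].
train_rows = [
    {"text": "The quick brown fox jumps over the lazy dog."},
    {"text": "Fine-tuning GPT-2 on custom text."},
]
test_rows = [{"text": "A held-out example for evaluation."}]

# The Hugging Face "json" loader accepts JSON Lines: one JSON object per line.
with open("train.json", "w") as f:
    for row in train_rows:
        f.write(json.dumps(row) + "\n")
with open("test.json", "w") as f:
    for row in test_rows:
        f.write(json.dumps(row) + "\n")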