
@brando90
brando90 / multiple_gpus_1_file.py
Created December 16, 2024 21:44
multiple_gpus_1_file.py
def main():
    import os
    import sys
    import socket
    print(sys.executable)
    if socket.gethostname() == 'skampere1':
        print('Hardcoding the path since we are in skampere')
        sys.path = ['', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python311.zip', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11/lib-dynload', '/lfs/skampere1/0/brando9/miniconda/envs/beyond_scale_2/lib/python3.11/site-packages', '/afs/cs.stanford.edu/u/brando9/beyond-scale-2-alignment-coeff/py_src', '/afs/cs.stanford.edu/u/brando9/ultimate-utils/py_src']
        print(f'{sys.path=}')
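The preview above only shows the interpreter and sys.path setup. As a rough, hedged sketch of what driving several GPUs from one file can look like (my assumption based on the filename, not the rest of the gist), one can enumerate the visible devices and place a small computation on each:

import torch

def demo_multi_gpu():
    # Enumerate the GPUs visible to this process and run a tiny matmul on each one.
    num_gpus = torch.cuda.device_count()
    print(f'{num_gpus=}')
    for idx in range(num_gpus):
        device = torch.device(f'cuda:{idx}')
        x = torch.rand((1000, 1000), device=device)
        y = torch.rand((1000, 1000), device=device)
        print(f'device {idx}: (x @ y).sum() = {(x @ y).sum().item():.2f}')

if __name__ == '__main__':
    demo_multi_gpu()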
@brando90
brando90 / training_guidelines.md
Created November 29, 2024 21:59
nothing below 16 bits for training

Training Guidelines Summary

  • SFT: Use bf16 or fp32 for training; avoid 8-bit. For evaluation, fp16, bf16, or fp32 is fine. Follow established scripts for reliability (a minimal bf16 config sketch follows below).
  • Unsloth: Train LoRA with fp16, bf16, or fp32. Avoid 8-bit or lower unless validated by replicating the original experiments. No QLoRA until the core setups are stable and everything before this has worked.
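A minimal sketch of the bf16 setting referenced in the SFT bullet, assuming a Hugging Face Trainer workflow (the output_dir and hyperparameters below are placeholders, not from the original notes):

from transformers import TrainingArguments

# bf16 training per the guideline above; everything else is a placeholder value.
training_args = TrainingArguments(
    output_dir='./sft_out',            # hypothetical path
    bf16=True,                         # train in bfloat16, never below 16 bits
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=2e-5,
)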
import torch
# Create two matrices on the GPU
matrix_a = torch.rand((1000, 1000), device='cuda')
matrix_b = torch.rand((1000, 1000), device='cuda')
# Perform element-wise matrix sum
result = matrix_a + matrix_b
# Verify and print the device of the result
print(f'{result.device=}')
assert result.device.type == 'cuda', 'result should live on the GPU'
@brando90
brando90 / gemma_tok_how_does_mask_look_if_eos_pad_both_present_in_tok.py
Last active November 21, 2024 04:43
gemma 2 2b tokenizer properly adding eos padding and masking
# ref: https://chatgpt.com/c/673e8232-0a18-8001-9fb5-ed1262bf267f
# ref: https://gist.github.com/brando90/4cd94ad3730218dca75dba779f770c9d
from transformers import AutoTokenizer
def analyze_tokenizer_output(model_name, text, pad_token="<pad>", eos_token="</s>", max_length=20):
"""
Analyzes the tokenizer output, including the attention mask and labels,
when eos_token and pad_token are present.
"""
# Load the tokenizer
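    # --- The gist preview cuts off here. What follows is a hedged sketch of how the body
    # --- might continue, based on the docstring and gist title, not the gist's actual code.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Ensure an explicit pad token exists and differs from eos (some checkpoints ship none).
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": pad_token})
    # Append eos so that eos and pad are both present, then pad to max_length.
    encoded = tokenizer(
        text + (tokenizer.eos_token or eos_token),
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    print(f"{encoded['input_ids']=}")
    print(f"{encoded['attention_mask']=}")
    return encoded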
@brando90
brando90 / teacher_forced_accuracy.py
Created November 21, 2024 00:29
teacher_forced_accuracy.py
#ref: https://chatgpt.com/share/673e7ef2-23cc-8001-b682-3ff4b66c797a
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
def compute_tfa(model, tokenizer, input_texts):
"""
Computes Teacher-Forced Accuracy (TFA), rewarding the model for correctly predicting
the first EOS token while ignoring predictions for padding tokens.
Parameters:
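        model: causal language model to evaluate (e.g., an AutoModelForCausalLM).
        tokenizer: tokenizer matching the model.
        input_texts: list of strings to score.
    """
    # --- The gist preview cuts off above. Below is a hedged sketch of the general idea
    # --- (per-token accuracy under teacher forcing, skipping padding), not the gist's code.
    model.eval()
    correct, total = 0, 0
    for text in input_texts:
        # Append eos so the first EOS position is among the scored predictions.
        enc = tokenizer(text + tokenizer.eos_token, return_tensors="pt").to(model.device)
        with torch.no_grad():
            logits = model(**enc).logits            # (1, seq_len, vocab_size)
        preds = logits[:, :-1, :].argmax(dim=-1)    # prediction for position t+1
        targets = enc["input_ids"][:, 1:]           # gold token at position t+1
        mask = enc["attention_mask"][:, 1:].bool()  # drop padding positions
        correct += (preds[mask] == targets[mask]).sum().item()
        total += int(mask.sum().item())
    return correct / max(total, 1)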
{
    "source": "...",
    "id": "...",
    "attributes": {
      "compression_ratio_zstd": 0.7
    }
}
@brando90
brando90 / gist:c55c74e840d42c952d4aec7b74e0be6c
Created August 14, 2024 19:01
vllm_python_potential_way_to_resolve_issues.txt
# # 'torch==2.1.2', # 2.2 not supported due to vllm see: https://github.com/vllm-project/vllm/issues/2747
...
# 'transformers>=4.40',
# 'accelerate==0.29.2',
...
# 'datasets==2.14.7',
# 'evaluate==0.4.1',
# 'bitsandbytes== 0.43.0',
# 'einops',
# 'flash-attn>=2.5.8',
@brando90
brando90 / maf_dual_backtranslation_self_improving.md
Created June 7, 2024 17:53
MAF dual backtranslation self-improving loop

Simplified version of Dually Ground BackTranslation for AutoFormalization:

def train_to_af_for_maf(mdl : causal_lm,
                        formal_data_set, # e.g., ITP lib like mathlib
                        informal_data_set,  # e.g., time-tested maths textbook e.g., Rudin, CLRS.
                        ):
    for (nl, fl*) in formal_data_set; for (nl*, fl) in informal_data_set;
        # -- Learn to Formalize: nl_i->fl* from fl* -> [nl_i]_i -> fl*
        [nl_i]_i := mdl("informalize " + fl*, sampling=top_p, num_out=k)  # noise is good for robustness!
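A minimal runnable sketch of the informalize step the pseudocode describes, assuming a Hugging Face causal LM and a hypothetical "informalize " prompt prefix (illustrative only, not the gist's actual API):

from transformers import AutoTokenizer, AutoModelForCausalLM

def informalize(model, tokenizer, formal_stmt, k=4, top_p=0.95, max_new_tokens=128):
    # fl* -> [nl_i]_i : sample k informal paraphrases of one formal statement.
    inputs = tokenizer("informalize " + formal_stmt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        do_sample=True,
        top_p=top_p,
        num_return_sequences=k,
        max_new_tokens=max_new_tokens,
    )
    return [tokenizer.decode(seq, skip_special_tokens=True) for seq in out]

# Each sampled nl_i can then be paired with fl* as a (prompt, target) example for training
# the formalization direction nl_i -> fl*, as in the loop above.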
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
# Load dataset from a JSON file
data_files = {"train": "path/to/your/train.json", "test": "path/to/your/test.json"}
dataset = load_dataset("json", data_files=data_files)
# Load pre-trained GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
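The preview stops after loading the model; a minimal sketch of how the imported Trainer pieces would typically be wired together (the "text" field name, output_dir, and hyperparameters are placeholder assumptions, not from the original gist):

# GPT-2 has no pad token by default; reuse eos for padding.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(examples):
    # Assumes each JSON record has a "text" field.
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=dataset["train"].column_names)

# Causal LM objective (mlm=False); the collator also handles dynamic padding.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
training_args = TrainingArguments(output_dir="./gpt2_finetune", num_train_epochs=1, per_device_train_batch_size=4)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    data_collator=data_collator,
)
trainer.train()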
# -- HELM prompt, 8 shot, CoT? ref: https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/runs/v1.0.0/math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b/scenario_state.json, https://crfm.stanford.edu/helm/lite/latest/#/runs/math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b
HELM_MATH_PROMPT: str = (
"""Given a mathematics problem, determine the answer. Simplify your answer as much as possible.###
Problem: Let $r=3^s-s$ and $s=2^n+1$. What is the value of $r$ when $n=2$?
Answer: First substitute $n=2$ into the expression for $s$ to find $s=2^2+1=5$. Then substitute $s=5$ into the expression for $r$ to find $r=3^5-5=243-5=\\boxed{238}.###
Problem: If $x^{2y}= 4$ and $x = 4$, what is the value of $y$? Express your answer as a common fraction.
Answer: Plugging $x = 4$ into the first equation, we get $4^{2y} = 4^1 \\Rightarrow 2y = 1 \\Rightarrow y = \\boxed{\\frac{1}{2}}.###
Problem: If $y = \\dis