This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from transformers import GPT2LMHeadModel, GPTNeoForCausalLM, GPT2Config | |
def convert_neo_to_gpt2(neo_model_path, output_path, target_positions=1024): | |
# Load the trained GPT-Neo model | |
neo_model = GPTNeoForCausalLM.from_pretrained(neo_model_path) | |
# Create a GPT-2 config matching GPT-Neo's structure but with reduced position embeddings | |
gpt2_config = GPT2Config( | |
vocab_size=neo_model.config.vocab_size, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import trimesh | |
from PIL import Image | |
import numpy as np | |
import io | |
# If you are running on Apple Silicon, you may need to comment out the | |
# following lines as described in this GitHub issue | |
# to avoid running into an issue with the trimesh library: | |
# https://github.com/mikedh/trimesh/issues/2084#issuecomment-1840072858 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import trimesh | |
def load_mesh(file_path):
    """Read the mesh stored at *file_path* and return the trimesh object."""
    mesh = trimesh.load(file_path)
    return mesh
def compute_depth_ranges(mesh, num_buckets=5): | |
# Extract vertex depths (assuming z-coordinate represents depth) | |
depths = mesh.vertices[:, 2] | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Given two tokenizers, combine them and create a new tokenizer | |
Usage: python combine_tokenizers.py --tokenizer1 ./SmolLM-135M --tokenizer2 ./hebrew-14k --save_dir ./combined | |
Source: https://github.com/huggingface/tokenizers/issues/690#issuecomment-830665989 | |
""" | |
# Libraries for tokenizer | |
from pathlib import Path |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
from glob import glob | |
from torch.utils.data import IterableDataset, DataLoader | |
class BatchProcessedDataset(IterableDataset): | |
""" | |
A dataset which streams and processes lines from files, concatenating a specified number of lines. | |
""" | |
def __init__(self, files, batch_size=4096, lines_per_entry=20): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load four tokenizers side by side for comparison:
# Grok-1, Gemma-7B-IT, Aya-101, and GPT-2.
from transformers import AutoTokenizer, LlamaTokenizerFast

# Grok's tokenizer is distributed as a Llama-style fast tokenizer.
tokenizer_grok = LlamaTokenizerFast.from_pretrained('Xenova/grok-1-tokenizer')
tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-7b-it")
tokenizer_aya101 = AutoTokenizer.from_pretrained("CohereForAI/aya-101")
tokenizer_gpt2 = AutoTokenizer.from_pretrained("gpt2")

# Commented-out sample Hebrew prompt, kept verbatim for later use:
# prompt_text='''מודל ראשון בגודל 6-מיליארד פרמטרים מתאמן כרגע על חלק מהדאטסטים שהגבתם, עכשיו כשהמודל על האש אני אתפנה לענות לכולם. מתנצל על העיכוב, קיבלתי המון הודעות ולא ציפיתי לכזו הענות, אתם אדירים!
# שלב הבא: להרכיב דאטהסט אחד ענק מכל הרעיונות והלינקים שצירפתם בשביל האימון המרכזי.'''
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load several tokenizers for side-by-side comparison:
# Grok-1, Gemma-7B-IT, Aya-101, and GPT-2.
from transformers import AutoTokenizer, LlamaTokenizerFast

# Alternative Hebrew tokenizer, kept for reference:
#tokenizer_yam = AutoTokenizer.from_pretrained("yam-peleg/Hebrew-Gemma-11B-V2")

# Grok's tokenizer is distributed as a Llama-style fast tokenizer.
tokenizer_grok = LlamaTokenizerFast.from_pretrained('Xenova/grok-1-tokenizer')
tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-7b-it")
tokenizer_aya101 = AutoTokenizer.from_pretrained("CohereForAI/aya-101")
tokenizer_gpt2 = AutoTokenizer.from_pretrained("gpt2")
NewerOlder