@Norod
Norod / neo2gpt2.py
Last active November 3, 2024 13:06
A script which 'converts' an existing GPT-Neo model into the GPT-2 architecture with a modified target_positions value. The resulting GPT-2 model then needs further training, using the same tokenizer as the original model, to recover from the switch from local to global attention.
import torch
from transformers import GPT2LMHeadModel, GPTNeoForCausalLM, GPT2Config
def convert_neo_to_gpt2(neo_model_path, output_path, target_positions=1024):
    # Load the trained GPT-Neo model
    neo_model = GPTNeoForCausalLM.from_pretrained(neo_model_path)
    # Create a GPT-2 config matching GPT-Neo's structure but with reduced position embeddings
    gpt2_config = GPT2Config(
        vocab_size=neo_model.config.vocab_size,
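The preview cuts off inside the GPT2Config call. Below is a rough, self-contained sketch of how the rest of the conversion could look, assuming the usual GPT-Neo layout (separate q/k/v/out projections without q/k/v biases, Linear MLP layers, learned position embeddings in transformer.wpe) and GPT-2's fused Conv1D weights; it is not the gist's actual code, and the converted model still needs the recovery training described above.

import torch
from transformers import GPT2LMHeadModel, GPTNeoForCausalLM, GPT2Config

def convert_neo_to_gpt2_sketch(neo_model_path, output_path, target_positions=1024):
    neo_model = GPTNeoForCausalLM.from_pretrained(neo_model_path)
    neo_cfg = neo_model.config

    # Mirror GPT-Neo's dimensions, but shrink the positional range.
    gpt2_config = GPT2Config(
        vocab_size=neo_cfg.vocab_size,
        n_positions=target_positions,
        n_embd=neo_cfg.hidden_size,
        n_layer=neo_cfg.num_layers,
        n_head=neo_cfg.num_heads,
        n_inner=neo_cfg.intermediate_size,  # None means 4 * n_embd, as in GPT-Neo
        activation_function=neo_cfg.activation_function,
        layer_norm_epsilon=neo_cfg.layer_norm_epsilon,
    )
    gpt2_model = GPT2LMHeadModel(gpt2_config)

    neo, gpt2 = neo_model.transformer, gpt2_model.transformer
    with torch.no_grad():
        # Token embeddings, plus the learned position embeddings truncated
        # to the new target_positions length.
        gpt2.wte.weight.copy_(neo.wte.weight)
        gpt2.wpe.weight.copy_(neo.wpe.weight[:target_positions])

        for neo_block, gpt2_block in zip(neo.h, gpt2.h):
            gpt2_block.ln_1.load_state_dict(neo_block.ln_1.state_dict())
            gpt2_block.ln_2.load_state_dict(neo_block.ln_2.state_dict())

            attn = neo_block.attn.attention
            # GPT-2 fuses q/k/v into one Conv1D whose weight is (in, 3 * out),
            # so concatenate the transposed Linear weights along dim 1.
            gpt2_block.attn.c_attn.weight.copy_(torch.cat(
                [attn.q_proj.weight.t(), attn.k_proj.weight.t(), attn.v_proj.weight.t()], dim=1))
            gpt2_block.attn.c_attn.bias.zero_()  # GPT-Neo's q/k/v projections have no bias
            gpt2_block.attn.c_proj.weight.copy_(attn.out_proj.weight.t())
            gpt2_block.attn.c_proj.bias.copy_(attn.out_proj.bias)

            # MLP: Linear -> Conv1D means transposing each weight matrix.
            gpt2_block.mlp.c_fc.weight.copy_(neo_block.mlp.c_fc.weight.t())
            gpt2_block.mlp.c_fc.bias.copy_(neo_block.mlp.c_fc.bias)
            gpt2_block.mlp.c_proj.weight.copy_(neo_block.mlp.c_proj.weight.t())
            gpt2_block.mlp.c_proj.bias.copy_(neo_block.mlp.c_proj.bias)

        gpt2.ln_f.load_state_dict(neo.ln_f.state_dict())

    gpt2_model.save_pretrained(output_path)
    return gpt2_model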
@Norod
Norod / obj_mesh_to_spritesheet.py
Created October 8, 2024 09:28
Renders a mesh from several view angles and saves the renders as a single spritesheet image
import trimesh
from PIL import Image
import numpy as np
import io
# If you are running on Apple Silicon, you may need to comment out the
# following lines as described in this GitHub issue
# to avoid running into an issue with the trimesh library:
# https://github.com/mikedh/trimesh/issues/2084#issuecomment-1840072858
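A rough sketch of the rendering loop, in case the preview above is not enough to go on: it assumes the .obj loads as a single trimesh.Trimesh and that trimesh's offscreen renderer (pyglet) is available; the frame count, tile size, and camera distance are illustrative, not the gist's values.

import io
import numpy as np
import trimesh
from PIL import Image

def render_spritesheet(obj_path, out_path="spritesheet.png", frames=8, tile=256):
    mesh = trimesh.load(obj_path)
    scene = mesh.scene()

    images = []
    for i in range(frames):
        # Rotate the camera around the vertical axis, one step per frame.
        angle = 2.0 * np.pi * i / frames
        scene.set_camera(angles=(0.0, angle, 0.0), distance=mesh.scale * 2.0)
        png_bytes = scene.save_image(resolution=(tile, tile))
        images.append(Image.open(io.BytesIO(png_bytes)).convert("RGBA"))

    # Paste the frames left to right into a single-row spritesheet.
    sheet = Image.new("RGBA", (tile * frames, tile))
    for i, img in enumerate(images):
        sheet.paste(img, (i * tile, 0))
    sheet.save(out_path)
    return sheet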
@Norod
Norod / obj_mesh_depth_bucket_disconnect.py
Created October 7, 2024 07:34
Disconnect mesh components in an .obj file based on a number of "depth range" buckets
import numpy as np
import trimesh
def load_mesh(file_path):
    return trimesh.load(file_path)

def compute_depth_ranges(mesh, num_buckets=5):
    # Extract vertex depths (assuming z-coordinate represents depth)
    depths = mesh.vertices[:, 2]
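The preview stops before the actual bucketing. Below is one plausible reading of the description, sketched under the assumption that faces are assigned to buckets by their mean vertex depth and that per-bucket re-indexing is what disconnects the components; the full gist may do this differently.

import numpy as np
import trimesh

def split_by_depth_buckets(file_path, out_path, num_buckets=5):
    mesh = trimesh.load(file_path)

    # Bucket edges over the z (depth) range of the mesh.
    depths = mesh.vertices[:, 2]
    edges = np.linspace(depths.min(), depths.max(), num_buckets + 1)

    # Assign each face to a bucket by the mean depth of its vertices.
    face_depths = depths[mesh.faces].mean(axis=1)
    bucket_ids = np.clip(np.digitize(face_depths, edges) - 1, 0, num_buckets - 1)

    parts = []
    for b in range(num_buckets):
        faces = np.flatnonzero(bucket_ids == b)
        if len(faces):
            # submesh() re-indexes vertices, so buckets share no vertices and
            # end up as disconnected components in the exported file.
            parts.extend(mesh.submesh([faces], append=False))

    combined = trimesh.util.concatenate(parts)
    combined.export(out_path)
    return combined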
@Norod
Norod / combine_tokenizers.py
Created July 25, 2024 15:28
Given two BPE tokenizers, combine them and create a new tokenizer
"""
Given two tokenizers, combine them and create a new tokenizer
Usage: python combine_tokenizers.py --tokenizer1 ./SmolLM-135M --tokenizer2 ./hebrew-14k --save_dir ./combined
Source: https://github.com/huggingface/tokenizers/issues/690#issuecomment-830665989
"""
# Libraries for tokenizer
from pathlib import Path
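The linked issue thread describes a vocabulary-level merge; a minimal sketch of that idea follows, assuming the merge is done by adding tokenizer2's missing tokens to tokenizer1 with add_tokens (which stores them as added tokens rather than new BPE merges). The flag names follow the usage line above.

import argparse
from transformers import AutoTokenizer

def combine(tokenizer1_path, tokenizer2_path, save_dir):
    tok1 = AutoTokenizer.from_pretrained(tokenizer1_path)
    tok2 = AutoTokenizer.from_pretrained(tokenizer2_path)

    # Tokens present in tokenizer2 but missing from tokenizer1.
    new_tokens = set(tok2.get_vocab()) - set(tok1.get_vocab())
    added = tok1.add_tokens(sorted(new_tokens))
    print(f"Added {added} tokens; combined vocab size: {len(tok1)}")

    tok1.save_pretrained(save_dir)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer1", required=True)
    parser.add_argument("--tokenizer2", required=True)
    parser.add_argument("--save_dir", required=True)
    args = parser.parse_args()
    combine(args.tokenizer1, args.tokenizer2, args.save_dir)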
@Norod
Norod / Training_a_new_tokenizer_from_an_old_one.ipynb
Created July 18, 2024 13:02
A set of scripts for: training a small tokenizer in a new language, merging the small tokenizer with an existing one, and saving the combined tokenizer and resized model
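The notebook itself is not shown here, so this is only a hedged outline of the described steps using standard Hugging Face APIs (train a small tokenizer on a new-language corpus, merge it into an existing tokenizer, resize the model's embeddings); the model name, corpus, and vocabulary size below are placeholders, not values from the notebook.

from transformers import AutoTokenizer, AutoModelForCausalLM

base_name = "gpt2"          # placeholder base model and tokenizer
corpus = ["..."]            # placeholder iterable of new-language text

base_tok = AutoTokenizer.from_pretrained(base_name)

# 1. Train a small tokenizer in the new language, reusing the old one's settings.
new_tok = base_tok.train_new_from_iterator(corpus, vocab_size=14000)

# 2. Merge: add the new tokenizer's tokens that the base tokenizer lacks.
missing = set(new_tok.get_vocab()) - set(base_tok.get_vocab())
base_tok.add_tokens(sorted(missing))

# 3. Resize the model's embedding matrix to the combined vocabulary and save both.
model = AutoModelForCausalLM.from_pretrained(base_name)
model.resize_token_embeddings(len(base_tok))

base_tok.save_pretrained("./combined")
model.save_pretrained("./combined")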
@Norod
Norod / prepare_jsonl_dataset_file_from_txt_folder.py
Created July 11, 2024 15:33
Create a JSONL dataset by reading and processing lines from text files, concatenating a specified number of text lines into a single JSONL line, encoding newlines as \\n and allowing UTF-8 Unicode characters
import os
import json
from glob import glob
from torch.utils.data import IterableDataset, DataLoader
class BatchProcessedDataset(IterableDataset):
    """
    A dataset which streams and processes lines from files, concatenating a specified number of lines.
    """
    def __init__(self, files, batch_size=4096, lines_per_entry=20):
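A hedged sketch of how the streaming and concatenation might be completed and how the JSONL file could be written; the "text" field name and the file-writing loop are assumptions, and batch_size is kept only to match the signature in the preview.

import json
from glob import glob
from torch.utils.data import IterableDataset

class BatchProcessedDataset(IterableDataset):
    """Streams text files and yields entries of lines_per_entry concatenated lines."""

    def __init__(self, files, batch_size=4096, lines_per_entry=20):
        self.files = files
        self.batch_size = batch_size      # unused in this sketch
        self.lines_per_entry = lines_per_entry

    def __iter__(self):
        buffer = []
        for path in self.files:
            with open(path, encoding="utf-8") as f:
                for line in f:
                    buffer.append(line.rstrip("\n"))
                    if len(buffer) == self.lines_per_entry:
                        yield "\n".join(buffer)
                        buffer = []
        if buffer:
            yield "\n".join(buffer)

if __name__ == "__main__":
    dataset = BatchProcessedDataset(sorted(glob("txt_folder/*.txt")))
    with open("dataset.jsonl", "w", encoding="utf-8") as out:
        for entry in dataset:
            # json.dumps escapes real newlines as \n; ensure_ascii=False keeps UTF-8 readable.
            out.write(json.dumps({"text": entry}, ensure_ascii=False) + "\n")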
@Norod
Norod / heb_tokenize_compare.py
Created May 16, 2024 14:37
Tokenize a Hebrew prompt of length 374 (the current Wikipedia abstract for the term "Cat") with several tokenizers and compare the results
from transformers import AutoTokenizer
from transformers import LlamaTokenizerFast
tokenizer_grok = LlamaTokenizerFast.from_pretrained('Xenova/grok-1-tokenizer')
tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-7b-it")
tokenizer_aya101 = AutoTokenizer.from_pretrained("CohereForAI/aya-101")
tokenizer_gpt2 = AutoTokenizer.from_pretrained("gpt2")
# prompt_text='''מודל ראשון בגודל 6-מיליארד פרמטרים מתאמן כרגע על חלק מהדאטסטים שהגבתם, עכשיו כשהמודל על האש אני אתפנה לענות לכולם. מתנצל על העיכוב, קיבלתי המון הודעות ולא ציפיתי לכזו הענות, אתם אדירים!
# שלב הבא: להרכיב דאטהסט אחד ענק מכל הרעיונות והלינקים שצירפתם בשביל האימון המרכזי.'''
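A sketch of how the comparison presumably continues, reusing the tokenizer objects loaded above: encode the same prompt with each tokenizer and print the token counts (fewer tokens means the tokenizer handles Hebrew more efficiently). prompt_text is commented out in the preview, so a placeholder is used here, and the output format is an assumption.

prompt_text = "..."  # placeholder; use the Hebrew prompt commented out above

print(f"Prompt length: {len(prompt_text)}")
for name, tok in [("grok", tokenizer_grok), ("gemma", tokenizer_gemma),
                  ("aya-101", tokenizer_aya101), ("gpt2", tokenizer_gpt2)]:
    print(f"{name}: {len(tok(prompt_text)['input_ids'])} tokens")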
@Norod
Norod / apple_openelm-3b_cuda_gradio-demo.ipynb
Last active April 30, 2024 09:40
apple_openelm-3b_cuda_gradio-demo.ipynb
@Norod
Norod / apple_openelm-270m_cpu_gradio-demo.ipynb
Created April 24, 2024 17:08
Apple_OpenELM-270M_cpu_Gradio-Demo.ipynb
@Norod
Norod / heb_tokenize_compare.py
Created March 18, 2024 09:21
Compare Hebrew tokenization efficiency across various tokenizers (the lower the number, the better)
from transformers import AutoTokenizer
from transformers import LlamaTokenizerFast
#tokenizer_yam = AutoTokenizer.from_pretrained("yam-peleg/Hebrew-Gemma-11B-V2")
tokenizer_grok = LlamaTokenizerFast.from_pretrained('Xenova/grok-1-tokenizer')
tokenizer_gemma = AutoTokenizer.from_pretrained("google/gemma-7b-it")
tokenizer_aya101 = AutoTokenizer.from_pretrained("CohereForAI/aya-101")
tokenizer_gpt2 = AutoTokenizer.from_pretrained("gpt2")
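A hedged guess at how this version might report its numbers, reusing the tokenizers loaded above: encode a Hebrew sample with each tokenizer and print tokens per character, so a lower value means a more efficient Hebrew vocabulary. The sample text and output format are placeholders, not taken from the gist.

sample_text = "..."  # placeholder; any Hebrew passage to compare on

for name, tok in [("grok", tokenizer_grok), ("gemma", tokenizer_gemma),
                  ("aya-101", tokenizer_aya101), ("gpt2", tokenizer_gpt2)]:
    n_tokens = len(tok(sample_text)["input_ids"])
    print(f"{name}: {n_tokens} tokens, {n_tokens / len(sample_text):.3f} tokens per character")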