Script for the InternLM2 tokenizer to add ChatML tokens and fix null token 354 for ggml conversion
# Launch with PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python tokenizer_fix.py
import sentencepiece.sentencepiece_model_pb2 as model

# Load the original InternLM2 SentencePiece model.
m = model.ModelProto()
m.ParseFromString(open('./tokenizer.model', 'rb').read())

# Overwrite the placeholder pieces at these indices with the ChatML /
# InternLM2 special token strings.
m.pieces[92543].piece = '<|im_start|>'
m.pieces[92542].piece = '<|im_end|>'
m.pieces[92541].piece = '<|action_start|>'
m.pieces[92540].piece = '<|action_end|>'
m.pieces[92539].piece = '<|interpreter|>'
m.pieces[92538].piece = '<|plugin|>'

# Token 354 is null in the original model, which breaks ggml conversion;
# give it a unique placeholder string instead.
m.pieces[354].piece = "[ERROR_NULL_TOKEN_a76Y96a9eX7b]"

# Write the patched model to a new file.
with open('tokenizer_fixed.model', 'wb') as f:
    f.write(m.SerializeToString())
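
As a quick sanity check (not part of the original script), the patched model can be parsed back and the renamed pieces printed. This is a minimal sketch that assumes tokenizer_fixed.model was just written by the script above and is run with the same PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python setting.

# Verify the patched pieces (hypothetical check, not from the original gist).
import sentencepiece.sentencepiece_model_pb2 as model

check = model.ModelProto()
check.ParseFromString(open('tokenizer_fixed.model', 'rb').read())

# The ChatML tokens and the placeholder for token 354 should now appear
# at the patched indices.
for idx in (92543, 92542, 92541, 92540, 92539, 92538, 354):
    print(idx, repr(check.pieces[idx].piece))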