Quantize sentence-transformers models with Optimum Intel

This gist contains three scripts:

  • A basic inference example
  • A script that shows how to quantize a sentence-transformers model for use with OpenVINO
  • A script that shows how to evaluate a quantized model, comparing the INT8 model with the FP32 model

Basic usage

NOTE: The PR to add OpenVINO support to sentence-transformers has not been merged yet. For now, install sentence-transformers with pip install "git+https://github.com/helena-intel/sentence-transformers.git@helena/openvino-support"

To use sentence-transformers with OpenVINO, simply add backend="openvino" to the SentenceTransformer() model initialization. model_name_or_path can refer to a model_id on the Hugging Face Hub, or a path to a local directory with a compatible model. If a model_id with a PyTorch model is provided, it will be converted to OpenVINO on the fly.

model = SentenceTransformer(model_name_or_path, backend="openvino")

You can save this OpenVINO model and load it directly:

# load a model from the Hugging Face Hub and convert to OpenVINO on the fly
model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino")
# save the model
model.save("bge-base-en-v1.5-ov")
# load the saved OpenVINO model
model = SentenceTransformer("bge-base-en-v1.5-ov", backend="openvino")

Using an OpenVINO config

To use an OpenVINO config, set ov_config in model_kwargs. ov_config can either be a dictionary with an OpenVINO config, or a path to a .json file containing one:

model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino", model_kwargs={"ov_config": {"INFERENCE_PRECISION_HINT": "f32"}})
model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino", model_kwargs={"ov_config": "ov_config.json"})
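
For reference, a minimal ov_config.json could look like the following. INFERENCE_PRECISION_HINT matches the example above; CACHE_DIR is another standard OpenVINO property, shown here only as an illustration:

{
    "INFERENCE_PRECISION_HINT": "f32",
    "CACHE_DIR": "model_cache"
}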

Using an Intel GPU accelerator

To use an Intel iGPU or dGPU for inference, set model_kwargs["device"] to "GPU":

model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino", model_kwargs={"device": "GPU"})

NOTE: do not set the device argument of SentenceTransformer directly; with the OpenVINO backend, the device is selected through model_kwargs.
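
To check which devices OpenVINO detects on your system, you can query OpenVINO Core directly. This is a small sanity check, independent of sentence-transformers; the openvino package is installed as part of optimum[openvino]:

import openvino as ov

core = ov.Core()
print(core.available_devices)  # e.g. ['CPU', 'GPU'] when an Intel GPU is detected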

Quantizing

See the quantization and evaluation scripts in this gist for an example of how to quantize models for use with OpenVINO.

Inference script

# pip install optimum[openvino] mteb "git+https://github.com/helena-intel/sentence-transformers.git@helena/openvino-support"
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino")
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

print(model.encode("hello"))
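
model.encode returns the embedding as a NumPy array; for bge-base-en-v1.5 this is a 768-dimensional vector.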
Evaluation script

# pip install optimum[openvino] mteb sentence-transformers
# run quantize_sentencetransformers.py first, to create the OpenVINO FP32 and INT8 models.
# tested with mteb 1.18.6, sentence-transformers 3.2.1, optimum-intel 1.20.1
import gc
import json
from pathlib import Path

import datasets
import mteb
import pandas as pd
import transformers
from sentence_transformers import SentenceTransformer

transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()


def benchmark(model_id, output_folder, device):
    base_model_path = Path(f"models/{model_id}")
    fp32_model_path = base_model_path.with_name(base_model_path.name + "-ov-fp32")
    int8_ptq_model_path = base_model_path.with_name(base_model_path.name + "-ov-int8-ptq")
    results = {}
    tasks = ["STSB", "ImdbClassification"]
    tasks = ["CPUSpeedTask"]
    tasks = ["Banking77Classification", "CPUSpeedTask"]  # the last assignment wins; edit these lines to choose the MTEB tasks to run
    models = {"openvino_int8": int8_ptq_model_path, "openvino_fp32": fp32_model_path, "pytorch": model_id}
    for model_desc, model_path in models.items():
        mteb_tasks = mteb.get_tasks(tasks=tasks, languages=["eng"])
        evaluation = mteb.MTEB(tasks=mteb_tasks)
        if "openvino" in model_desc:
            model = SentenceTransformer(str(model_path), backend="openvino", model_kwargs={"device": device})
        elif model_desc == "pytorch":
            if device.lower() != "gpu":
                model = SentenceTransformer(str(model_path), backend="torch")
            else:
                # Run tasks on GPU only for OpenVINO models
                break
        results[f"accuracy_{model_desc}"] = evaluation.run(
            model, overwrite_results=True, output_folder=f"{output_folder}/{device}/{model_desc}", co2_tracker=True
        )
        del model
        gc.collect()
    print(results)


### Summarize results
def summarize(output_folder):
    def custom_sort_key(item):
        first_columns = ["evaluation_time", "g_co2_emissions"]
        if item in first_columns:
            return first_columns.index(item)
        else:
            return 2.5

    global_metrics = ["evaluation_time", "kg_co2_emissions"]
    scores_metrics = {
        "Banking77Classification": ["f1"],
        "ImdbClassification": ["f1"],
        "STSB": ["pearson"],
        "CPUSpeedTask": ["avg_words_per_sec"],
    }
    records = []
    for fn in Path(output_folder).glob("**/*.json"):
        if fn.stem != "model_meta":
            device = fn.parts[1]
            modeltype = fn.parts[2]
            task = fn.stem
            with open(fn) as f:
                result = json.load(f)
            for score_metric in scores_metrics[task]:
                metric = result["scores"]["test"][0][score_metric]
                records.append([modeltype, device, task, score_metric, round(metric, 3)])
            for global_metric in global_metrics:
                metric = result[global_metric]
                if global_metric == "kg_co2_emissions":
                    metric *= 1000
                    global_metric = "g_co2_emissions"
                records.append([modeltype, device, task, global_metric, round(metric, 3)])
    df = pd.DataFrame.from_records(records, columns=["modeltype", "device", "task", "metric", "value"])
    for task in df.task.unique():
        subdf = df[df.task == task]
        pivot = subdf.pivot_table(index=["task", "device", "modeltype"], columns="metric", values="value")
        # reorder the columns (evaluation time and CO2 emissions first) instead of only relabeling them
        pivot = pivot[sorted(pivot.columns.tolist(), key=custom_sort_key)]
        print(pivot)


if __name__ == "__main__":
    model_id = "BAAI/bge-base-en-v1.5"
    output_folder = "mteb_results"
    for device in ("CPU", "GPU"):
        benchmark(model_id, output_folder, device)
    summarize(output_folder)
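
For each task, summarize prints one pivot table with a row per device/modeltype combination and columns for the task metric, the evaluation time, and the CO2 emissions in grams.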
Quantization script (quantize_sentencetransformers.py)

# pip install optimum[openvino]
from functools import partial

from optimum.intel.openvino import OVConfig, OVModelForFeatureExtraction, OVQuantizationConfig, OVQuantizer
from transformers import AutoConfig, AutoTokenizer

MODEL_ID = "BAAI/bge-base-en-v1.5"
DATASET_NAME = "sentence-transformers/sentence-compression"
DATASET_COLUMN = "text"
INT8_SAVE_DIR = f"models/{MODEL_ID}-ov-int8-ptq"
FP32_SAVE_DIR = f"models/{MODEL_ID}-ov-fp32"


def preprocess_function(examples, tokenizer):
    """Convert the text from the dataset into tokens in the format that the model expects."""
    config = AutoConfig.from_pretrained(MODEL_ID)
    return tokenizer(
        examples[DATASET_COLUMN],
        padding="max_length",
        max_length=config.max_position_embeddings,
        truncation=True,
        return_tensors="pt",
    )


# export the PyTorch model to OpenVINO and save the FP32 model
model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID, export=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model.save_pretrained(FP32_SAVE_DIR)
tokenizer.save_pretrained(FP32_SAVE_DIR)

# quantize to INT8 with post-training quantization on a calibration dataset
quantizer = OVQuantizer.from_pretrained(model)
ov_config = OVConfig(quantization_config=OVQuantizationConfig())
calibration_dataset = quantizer.get_calibration_dataset(
    DATASET_NAME,
    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),
    num_samples=300,
    dataset_split="train",
)
quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=INT8_SAVE_DIR)
tokenizer.save_pretrained(INT8_SAVE_DIR)
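
Once the quantization script has run, the INT8 model can be loaded directly with sentence-transformers. A minimal sketch; the path matches INT8_SAVE_DIR in the script above:

from sentence_transformers import SentenceTransformer

# load the quantized OpenVINO model and encode a sentence
model = SentenceTransformer("models/BAAI/bge-base-en-v1.5-ov-int8-ptq", backend="openvino")
print(model.encode("hello"))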