Quantize sentence-transformers models with Optimum Intel

This gist contains three scripts:

  • A basic inference example
  • A script that shows how to quantize a sentence-transformers model for use with OpenVINO
  • A script that shows how to evaluate a quantized model, comparing the INT8 model with the FP32 model

Basic usage

NOTE: The PR to add OpenVINO support to sentence-transformers has not been merged yet. For now, install sentence-transformers with pip install "git+https://github.com/helena-intel/sentence-transformers.git@helena/openvino-support"

To use sentence-transformers with OpenVINO, simply add backend="openvino" to the SentenceTransformer() model initialization. model_name_or_path can refer to a model_id on the Hugging Face Hub, or a path to a local directory with a compatible model. If a model_id with a PyTorch model is provided, it will be converted to OpenVINO on the fly.

model = SentenceTransformer(model_name_or_path, backend="openvino")

You can save this OpenVINO model and load it directly:

# load a model from the Hugging Face Hub and convert to OpenVINO on the fly
model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino")
# save the model
model.save("bge-base-en-v1.5-ov")
# load the saved OpenVINO model
model = SentenceTransformer("bge-base-en-v1.5-ov", backend="openvino")

Using an OpenVINO config

To use an OpenVINO config, set ov_config in model_kwargs. ov_config can either be a dictionary with an OpenVINO config, or a path to a .json file containing one:

model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino", model_kwargs={"ov_config": {"INFERENCE_PRECISION_HINT": "f32"}})
model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino", model_kwargs={"ov_config": "ov_config.json"})
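
For reference, a minimal ov_config.json could look like the following. INFERENCE_PRECISION_HINT matches the example above; CACHE_DIR is another standard OpenVINO property, shown here only as an illustration:

{
    "INFERENCE_PRECISION_HINT": "f32",
    "CACHE_DIR": "model_cache"
}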

Using an Intel GPU accelerator

To use an Intel iGPU or dGPU for inference, set model_kwargs["device"] to "GPU":

model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino", model_kwargs={"device": "GPU"})

NOTE: do not set the device argument of SentenceTransformer directly; with the OpenVINO backend, the device is selected through model_kwargs.
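
To check which devices OpenVINO detects on your system, you can query OpenVINO Core directly. This is a small sanity check, independent of sentence-transformers; the openvino package is installed as part of optimum[openvino]:

import openvino as ov

core = ov.Core()
print(core.available_devices)  # e.g. ['CPU', 'GPU'] when an Intel GPU is detected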

Quantizing

See the quantization and evaluation scripts in this gist for an example of how to quantize models for use with OpenVINO.

Inference script

# pip install optimum[openvino] mteb "git+https://github.com/helena-intel/sentence-transformers.git@helena/openvino-support"
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer

model = SentenceTransformer("BAAI/bge-base-en-v1.5", backend="openvino")
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

print(model.encode("hello"))
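
model.encode returns the embedding as a NumPy array; for bge-base-en-v1.5 this is a 768-dimensional vector.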
Evaluation script

# pip install optimum[openvino] mteb sentence-transformers
# run quantize_sentencetransformers.py first, to create the OpenVINO FP32 and INT8 models.
# tested with mteb 1.18.6, sentence-transformers 3.2.1, optimum-intel 1.20.1
import gc
import json
from pathlib import Path

import datasets
import mteb
import pandas as pd
import transformers
from sentence_transformers import SentenceTransformer

transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()


def benchmark(model_id, output_folder, device):
    base_model_path = Path(f"models/{model_id}")
    fp32_model_path = base_model_path.with_name(base_model_path.name + "-ov-fp32")
    int8_ptq_model_path = base_model_path.with_name(base_model_path.name + "-ov-int8-ptq")
    results = {}
    tasks = ["STSB", "ImdbClassification"]
    tasks = ["CPUSpeedTask"]
    tasks = ["Banking77Classification", "CPUSpeedTask"]  # the last assignment wins; edit these lines to choose the MTEB tasks to run
    models = {"openvino_int8": int8_ptq_model_path, "openvino_fp32": fp32_model_path, "pytorch": model_id}
    for model_desc, model_path in models.items():
        mteb_tasks = mteb.get_tasks(tasks=tasks, languages=["eng"])
        evaluation = mteb.MTEB(tasks=mteb_tasks)
        if "openvino" in model_desc:
            model = SentenceTransformer(str(model_path), backend="openvino", model_kwargs={"device": device})
        elif model_desc == "pytorch":
            if device.lower() != "gpu":
                model = SentenceTransformer(str(model_path), backend="torch")
            else:
                # Run tasks on GPU only for OpenVINO models
                break
        results[f"accuracy_{model_desc}"] = evaluation.run(
            model, overwrite_results=True, output_folder=f"{output_folder}/{device}/{model_desc}", co2_tracker=True
        )
        del model
        gc.collect()
    print(results)


### Summarize results
def summarize(output_folder):
    def custom_sort_key(item):
        first_columns = ["evaluation_time", "g_co2_emissions"]
        if item in first_columns:
            return first_columns.index(item)
        else:
            return 2.5

    global_metrics = ["evaluation_time", "kg_co2_emissions"]
    scores_metrics = {
        "Banking77Classification": ["f1"],
        "ImdbClassification": ["f1"],
        "STSB": ["pearson"],
        "CPUSpeedTask": ["avg_words_per_sec"],
    }
    records = []
    for fn in Path(output_folder).glob("**/*.json"):
        if fn.stem != "model_meta":
            device = fn.parts[1]
            modeltype = fn.parts[2]
            task = fn.stem
            with open(fn) as f:
                result = json.load(f)
            for score_metric in scores_metrics[task]:
                metric = result["scores"]["test"][0][score_metric]
                records.append([modeltype, device, task, score_metric, round(metric, 3)])
            for global_metric in global_metrics:
                metric = result[global_metric]
                if global_metric == "kg_co2_emissions":
                    metric *= 1000
                    global_metric = "g_co2_emissions"
                records.append([modeltype, device, task, global_metric, round(metric, 3)])
    df = pd.DataFrame.from_records(records, columns=["modeltype", "device", "task", "metric", "value"])
    for task in df.task.unique():
        subdf = df[df.task == task]
        pivot = subdf.pivot_table(index=["task", "device", "modeltype"], columns="metric", values="value")
        # reorder the columns (evaluation time and CO2 emissions first) instead of only relabeling them
        pivot = pivot[sorted(pivot.columns.tolist(), key=custom_sort_key)]
        print(pivot)


if __name__ == "__main__":
    model_id = "BAAI/bge-base-en-v1.5"
    output_folder = "mteb_results"
    for device in ("CPU", "GPU"):
        benchmark(model_id, output_folder, device)
    summarize(output_folder)
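
For each task, summarize prints one pivot table with a row per device/modeltype combination and columns for the task metric, the evaluation time, and the CO2 emissions in grams.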
Quantization script (quantize_sentencetransformers.py)

# pip install optimum[openvino]
from functools import partial

from optimum.intel.openvino import OVConfig, OVModelForFeatureExtraction, OVQuantizationConfig, OVQuantizer
from transformers import AutoConfig, AutoTokenizer

MODEL_ID = "BAAI/bge-base-en-v1.5"
DATASET_NAME = "sentence-transformers/sentence-compression"
DATASET_COLUMN = "text"
INT8_SAVE_DIR = f"models/{MODEL_ID}-ov-int8-ptq"
FP32_SAVE_DIR = f"models/{MODEL_ID}-ov-fp32"


def preprocess_function(examples, tokenizer):
    """Convert the text from the dataset into tokens in the format that the model expects."""
    config = AutoConfig.from_pretrained(MODEL_ID)
    return tokenizer(
        examples[DATASET_COLUMN],
        padding="max_length",
        max_length=config.max_position_embeddings,
        truncation=True,
        return_tensors="pt",
    )


# export the PyTorch model to OpenVINO and save the FP32 model
model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID, export=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model.save_pretrained(FP32_SAVE_DIR)
tokenizer.save_pretrained(FP32_SAVE_DIR)

# quantize to INT8 with post-training quantization on a calibration dataset
quantizer = OVQuantizer.from_pretrained(model)
ov_config = OVConfig(quantization_config=OVQuantizationConfig())
calibration_dataset = quantizer.get_calibration_dataset(
    DATASET_NAME,
    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),
    num_samples=300,
    dataset_split="train",
)
quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=INT8_SAVE_DIR)
tokenizer.save_pretrained(INT8_SAVE_DIR)
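
Once the quantization script has run, the INT8 model can be loaded directly with sentence-transformers. A minimal sketch; the path matches INT8_SAVE_DIR in the script above:

from sentence_transformers import SentenceTransformer

# load the quantized OpenVINO model and encode a sentence
model = SentenceTransformer("models/BAAI/bge-base-en-v1.5-ov-int8-ptq", backend="openvino")
print(model.encode("hello"))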