环境信息构建:
# 确定 CUDA 和 onnxruntime 的版本对应关系: https://blog.51cto.com/u_15962038/12360310
conda create -n mypython311 python=3.11.4 cudnn=8.9.2.26 cudatoolkit=11.8.0
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu118
pip install onnxruntime-gpu==1.18.1 onnx==1.17.0 numpy==1.26.4 transformers==4.46.2 triton==2.0.0
模型转换:
import os
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.onnx.features import FeaturesManager
import json

# Export script: loads a Hugging Face checkpoint and writes an ONNX graph plus
# a small JSON file recording the graph's input/output names.

# Directory of the original Hugging Face checkpoint (placeholder path).
BCE_MODEL_PATH = '/path/bce-embedding-base_v1/'

tokenizer = AutoTokenizer.from_pretrained(BCE_MODEL_PATH)
config = AutoConfig.from_pretrained(BCE_MODEL_PATH)
bge_model = AutoModel.from_pretrained(BCE_MODEL_PATH)
bge_model.eval()  # inference mode, so the traced graph is deterministic

# The ONNX config supplies the input/output names and the dummy inputs used
# to trace the model.
# NOTE(review): the "bert" config is hard-coded here — confirm it matches the
# architecture of the checkpoint being exported.
onnx_config = FeaturesManager.get_config("bert", "default")(config)
dummy_inputs = onnx_config.generate_dummy_inputs(tokenizer, framework='pt')

base_path = "/path/output_model/bce-embedding-base_v1"
# Create the output directory up front; both writes below fail without it.
os.makedirs(base_path, exist_ok=True)
output_onnx_path = os.path.join(base_path, "bce-embedding-base_v1.onnx")
config_file = os.path.join(base_path, "bce-embedding-base_v1_config.json")

input_names = list(onnx_config.inputs.keys())
output_names = list(onnx_config.outputs.keys())

# Dynamic batch size and sequence length for every input.
dynamic_axes = {
    'input_ids': {0: 'batch_size', 1: 'sequence_length'},
    'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
    'token_type_ids': {0: 'batch_size', 1: 'sequence_length'},
}
# The keys of dynamic_axes must be real input/output names of the exported
# graph, so the output entries are derived from output_names instead of a
# made-up 'output' key.
# NOTE(review): assumes every exported output is (batch, sequence, ...) —
# confirm if a feature other than "default" is used.
for output_name in output_names:
    dynamic_axes[output_name] = {0: 'batch_size', 1: 'sequence_length'}

# The names passed to the runtime at inference time must match the
# output_names saved here.
with open(config_file, 'w', encoding='utf8') as fout:
    json.dump(
        {"input_names": input_names, "output_names": output_names},
        fout,
        ensure_ascii=False,
        indent=4,
    )

# dummy_inputs is passed as the last element of the args tuple, exactly as in
# the original call.
torch.onnx.export(
    bge_model,
    (dummy_inputs,),
    f=output_onnx_path,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
    opset_version=14,
)
onnx模型加载推理
embedding_backend.py
from typing import List
from transformers import AutoTokenizer
import concurrent.futures
from tqdm import tqdm
from abc import ABC, abstractmethod

# Placeholder: replace with the directory of the original (non-ONNX) model;
# it is passed to AutoTokenizer.from_pretrained below.
LOCAL_EMBED_PATH = "飞onnx的原始模型路径,加载tokenize相关文件"
# Number of worker threads used to run batches concurrently.
LOCAL_EMBED_WORKERS = 3
# Number of texts sent to get_embedding per call.
LOCAL_EMBED_BATCH = 1
# Maximum token length passed to get_embedding. This constant was referenced
# but never defined in the original snippet.
# NOTE(review): 512 is assumed to be the model's limit — confirm.
LOCAL_EMBED_MAX_LENGTH = 512


class EmbeddingBackend(ABC):
    """Base class for local embedding backends.

    Subclasses implement ``get_embedding``; this class handles tokenizer
    loading, batching and concurrent dispatch.
    """

    embed_version = "local_v0.0.1_20230525_6d4019f1559aef84abc2ab8257e1ad4c"

    def __init__(self, use_cpu: bool = False):
        """Load the tokenizer from ``LOCAL_EMBED_PATH`` and store settings.

        Args:
            use_cpu: Stored on the instance for subclasses to consult.
        """
        self.use_cpu = use_cpu
        self._tokenizer = AutoTokenizer.from_pretrained(LOCAL_EMBED_PATH)
        self.workers = LOCAL_EMBED_WORKERS

    @abstractmethod
    def get_embedding(self, sentences, max_length) -> List:
        """Return one embedding per sentence in ``sentences``.

        Args:
            sentences: Batch of input texts.
            max_length: Maximum length forwarded by the caller.
        """

    def get_len_safe_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Embed ``texts`` in batches on a thread pool, preserving order.

        Args:
            texts: Texts to embed.

        Returns:
            The embeddings concatenated in the same order as ``texts``.
        """
        if not texts:
            # Nothing to embed; skip creating the thread pool.
            return []

        all_embeddings: List[List[float]] = []
        batch_size = LOCAL_EMBED_BATCH
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.workers) as executor:
            futures = [
                executor.submit(
                    self.get_embedding,
                    texts[start:start + batch_size],
                    LOCAL_EMBED_MAX_LENGTH,
                )
                for start in range(0, len(texts), batch_size)
            ]
            # Collect in submission order (not completion order) so the
            # output lines up with the input texts.
            for future in tqdm(futures):
                all_embeddings.extend(future.result())
        return all_embeddings

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs using multithreading, maintaining the original order."""
        return self.get_len_safe_embeddings(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed query text."""
        return self.embed_documents([text])[0]

    @property
    def getModelVersion(self):
        """Return the ``embed_version`` string of this backend."""
        return self.embed_version
模型加载:
import os
import numpy as np
import time
from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel
import logging

from embedding_backend import EmbeddingBackend

# Path to the exported ONNX model file.
LOCAL_EMBED_MODEL_PATH = "onnx_model_path/bce-embedding-base_v1.onnx"

# Module-level logger; the original snippet used ``debug_logger`` without
# ever defining it.
debug_logger = logging.getLogger(__name__)


class EmbeddingOnnxBackend(EmbeddingBackend):
    """Embedding backend that runs the exported model with ONNX Runtime."""

    def __init__(self, use_cpu: bool = False):
        """Create the ONNX Runtime session.

        Args:
            use_cpu: When true, only the CPU execution provider is requested;
                otherwise CUDA is listed first with CPU after it.
        """
        super().__init__(use_cpu)
        self.return_tensors = "np"

        sess_options = SessionOptions()
        sess_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        if use_cpu:
            providers = ['CPUExecutionProvider']
        else:
            providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
        self._session = InferenceSession(
            LOCAL_EMBED_MODEL_PATH,
            sess_options=sess_options,
            providers=providers,
        )

    def get_embedding(self, sentences, max_length):
        """Return L2-normalized embeddings for ``sentences``.

        Args:
            sentences: Batch of input texts.
            max_length: Truncation length passed to the tokenizer.

        Returns:
            A list with one embedding (list of floats) per sentence.
        """
        encoded = self._tokenizer(
            sentences,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors=self.return_tensors,
        )
        # The session expects a plain dict as its input feed.
        inputs_onnx = dict(encoded.items())

        start_time = time.perf_counter()
        # The output name must match "output_names" in the
        # bce-embedding-base_v1_config.json saved during model conversion.
        outputs_onnx = self._session.run(
            output_names=['last_hidden_state'],
            input_feed=inputs_onnx,
        )
        debug_logger.info("onnx infer time: %s", time.perf_counter() - start_time)

        # Take the vector at position 0 of each sequence as the sentence
        # embedding, then scale each row to unit L2 norm.
        embedding = outputs_onnx[0][:, 0]
        norm_arr = np.linalg.norm(embedding, axis=1, keepdims=True)
        embeddings_normalized = embedding / norm_arr
        return embeddings_normalized.tolist()
待验证部分:onnx 模型压缩,可参考 onnx-simplifier(https://github.com/daquexian/onnx-simplifier,Simplify your onnx model)。