add read me
@@ -0,0 +1,2 @@
# from .base_operator import QuantOperatorBase
# from .matmul import MatMulInteger
Binary files not shown (24 files).
@@ -0,0 +1,119 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QLinearActivation(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def QuantizeClipRelu(self):  # noqa: N802
        node = self.node
        assert node.op_type == "Relu" or node.op_type == "Clip"

        # When mode is QLinearOps, the output quantization params are calculated based on outputs from
        # activation nodes, therefore these nodes can be removed from the graph if they follow a quantized op.
        # If the input to this node is not quantized, keep this node.
        # If the activation is symmetric, do not quantize the op and simply return.
        if node.input[0] not in self.quantizer.quantized_value_map or self.quantizer.is_activation_symmetric:
            return super().quantize()

        quantized_value = self.quantizer.quantized_value_map[node.input[0]]
        self.quantizer.quantized_value_map[node.output[0]] = quantized_value

    def quantize(self):
        node = self.node
        if node.op_type == "Relu" or node.op_type == "Clip":
            self.QuantizeClipRelu()
            return

        nnapi_sigmoid_option = "extra.Sigmoid.nnapi"
        sigmoid_nnapi_mode = (
            node.op_type == "Sigmoid"
            and nnapi_sigmoid_option in self.quantizer.extra_options
            and self.quantizer.extra_options[nnapi_sigmoid_option]
        )
        use_scale = 1 / 256.0 if sigmoid_nnapi_mode else None
        use_zeropoint = 0 if sigmoid_nnapi_mode else None

        # No assert on op_type, as it is controlled by the registry;
        # only try to quantize when given quantization parameters for it.
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0], use_scale, use_zeropoint)
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_activation_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_activation_name = ""
        if node.name:
            qlinear_activation_name = node.name + "_quant"
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlinear_activation_inputs = [
            quantized_input_names[0],
            scale_names[0],
            zero_point_names[0],
            output_scale_name,
            output_zp_name,
        ]

        qlinear_activation_node = onnx.helper.make_node(
            "QLinear" + node.op_type,
            qlinear_activation_inputs,
            [qlinear_activation_output],
            qlinear_activation_name,
            **kwargs,
        )

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_activation_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        nodes.append(qlinear_activation_node)
        self.quantizer.new_nodes += nodes


class QDQRemovableActivation(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        # If the input to this node is not quantized, keep this node.
        if not self.quantizer.is_tensor_quantized(node.input[0]):
            return

        if (
            not self.quantizer.is_activation_symmetric
            and not self.quantizer.qdq_keep_removable_activations
            and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
        ):
            self.quantizer.remove_node(self.node)
        else:
            self.quantizer.quantize_activation_tensor(node.input[0])

            if not self.disable_qdq_for_node_output:
                self.quantizer.quantize_activation_tensor(node.output[0])
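# --- Illustrative sketch, not part of the commit: why a Relu/Clip that follows a
# quantized op can be dropped. The output quantization params above are computed
# from the *post*-activation range, so with asymmetric uint8 params over [0, rmax]
# the zero point is 0 and requantization already clamps negatives to 0, making an
# explicit Relu redundant. Pure-numpy stand-in; the values are hypothetical.
import numpy as np

x = np.array([-1.5, -0.1, 0.0, 0.7, 2.3], dtype=np.float32)
scale, zero_point = 2.3 / 255.0, 0  # derived from the post-Relu range [0.0, 2.3]
q = np.clip(np.round(x / scale) + zero_point, 0, 255).astype(np.uint8)
dequant = (q.astype(np.float32) - zero_point) * scale
assert (dequant >= 0).all()  # identical to applying Relu first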
@@ -0,0 +1,18 @@
from .base_operator import QuantOperatorBase


# Use the quantized tensor as input without DQ.
class QArgMax(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
        if quantized_input_value is None:
            self.quantizer.new_nodes += [node]
            return

        node.input[0] = quantized_input_value.q_name
        self.quantizer.new_nodes += [node]
@@ -0,0 +1,73 @@
import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase

"""
Quantize Attention
"""


class AttentionQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        return self.quantizer.should_quantize_node(self.node)

    def quantize(self):
        """
        parameter node: Attention node.
        parameter new_nodes_list: List of new nodes created before processing this node.
        return: a list of nodes in topological order that represents the quantized Attention node.
        """
        node = self.node
        assert node.op_type == "Attention"

        # TODO: This is a temporary fix to stop exporting QAttention with the qkv_hidden_sizes
        # attribute. It needs to be removed once QAttention for varied q, k, v sizes
        # is implemented.
        for attr in node.attribute:
            if attr.name == "qkv_hidden_sizes":
                return super().quantize()

        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        if quantized_input_names is None:
            return super().quantize()

        qattention_name = "" if not node.name else node.name + "_quant"

        inputs = []
        inputs.extend(quantized_input_names)
        inputs.extend([node.input[2]])
        inputs.extend(scale_names)
        inputs.extend([node.input[3] if len(node.input) > 3 else ""])
        inputs.extend(zero_point_names)
        inputs.extend([node.input[4] if len(node.input) > 4 else ""])

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qattention_node = onnx.helper.make_node("QAttention", inputs, node.output, qattention_name, **kwargs)
        nodes.append(qattention_node)

        self.quantizer.new_nodes += nodes
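# Note (editor's summary, not part of the commit): the QAttention input list
# assembled above is, in order: [input_q, weight_q, bias (kept float),
# input_scale, weight_scale, mask_index or "", input_zp, weight_zp, past or ""],
# with "" standing in for absent optional inputs.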
@@ -0,0 +1,26 @@
class QuantOperatorBase:
    def __init__(self, onnx_quantizer, onnx_node):
        self.quantizer = onnx_quantizer
        self.node = onnx_node

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            return False

        return self.quantizer.is_float_tensor(self.node.input[0])

    def quantize(self):
        """
        Given a node which does not support quantization, this method checks whether the input to
        this node is quantized and adds a DequantizeLinear node to dequantize this input back to FP32.
        parameter node: Current node
        parameter new_nodes_list: List of new nodes created before processing current node
        return: List of new nodes created
        """
        for _, node_input in enumerate(self.node.input):
            dequantize_node = self.quantizer._dequantize_value(node_input)
            if dequantize_node is not None:
                self.quantizer.new_nodes.append(dequantize_node)

        # Append the original node
        self.quantizer.new_nodes.append(self.node)
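# --- Illustrative sketch, not part of the commit: the fallback path above keeps
# an unsupported node in float by prepending a DequantizeLinear for each input
# that was already quantized. Tensor names here are hypothetical.
import onnx

dq = onnx.helper.make_node(
    "DequantizeLinear",
    ["x_quantized", "x_scale", "x_zero_point"],
    ["x"],  # restores the float tensor the unsupported op consumes
    "x_DequantizeLinear",
)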
@@ -0,0 +1,72 @@
import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearBinaryOp(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0, 1])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_binary_math_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_binary_math_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlinear_binary_math_inputs = []
        # Input 0
        qlinear_binary_math_inputs.append(quantized_input_names[0])
        qlinear_binary_math_inputs.append(scale_names[0])
        qlinear_binary_math_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_binary_math_inputs.append(quantized_input_names[1])
        qlinear_binary_math_inputs.append(scale_names[1])
        qlinear_binary_math_inputs.append(zero_point_names[1])

        # Output
        qlinear_binary_math_inputs.append(output_scale_name)
        qlinear_binary_math_inputs.append(output_zp_name)

        qlinear_binary_math_node = onnx.helper.make_node(
            "QLinear" + node.op_type,
            qlinear_binary_math_inputs,
            [qlinear_binary_math_output],
            qlinear_binary_math_name,
            **kwargs,
        )
        nodes.append(qlinear_binary_math_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_binary_math_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes
@@ -0,0 +1,62 @@
import onnx

from ..quant_utils import (  # noqa: F401
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    ms_domain,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase  # noqa: F401


class QLinearConcat(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            q_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [*range(len(node.input))])
        if not data_found or q_input_names is None:
            return super().quantize()

        # Create an entry for the output quantized value
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            output_scale_name,
            output_zp_name,
            quantized_input_value.value_type,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qnode_name = node.name + "_quant" if node.name else ""

        qlconcat_inputs = [output_scale_name, output_zp_name]
        for i in range(len(q_input_names)):
            qlconcat_inputs.extend([q_input_names[i], scale_names[i], zero_point_names[i]])
        qlconcat_node = onnx.helper.make_node(
            "QLinearConcat", qlconcat_inputs, [quantized_output_value.q_name], qnode_name, **kwargs
        )

        self.quantizer.new_nodes += nodes
        self.quantizer.new_nodes += [qlconcat_node]
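# Note (editor's summary, not part of the commit): the QLinearConcat input layout
# built above is [y_scale, y_zp, x0, x0_scale, x0_zp, x1, x1_scale, x1_zp, ...],
# i.e. the output parameters come first, then one (tensor, scale, zero point)
# triple per concatenated input.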
@@ -0,0 +1,260 @@
import numpy as np
import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    find_by_name,
    get_mul_node,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class ConvInteger(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def add_bias(self, nodes, scaled_output):
        """
        Handles the bias add for the current Conv node by adding a Reshape node for the bias
        and an Add node that produces the original Conv output.
        parameter nodes: new nodes are appended to this list
        parameter scaled_output: output of the quantized conv without bias
        """
        node = self.node
        model = self.quantizer.model
        # Add a tensor for the shape to be reshaped to
        weight = find_by_name(node.input[1], model.initializer())
        if weight is None:
            raise ValueError(f"Expected {node.input[1]} to be an initializer")

        # Add a reshape for correct broadcast
        output = node.output[0]
        reshape_input_data = node.input[2]  # bias of Conv
        reshape_input_shape = output + "_bias_reshape_shape"
        reshape_output = output + "_bias_reshape_output"

        shape = np.ones((len(weight.dims)), dtype=np.int64)
        shape[1] = -1
        init_shape = onnx.helper.make_tensor(
            reshape_input_shape, onnx_proto.TensorProto.INT64, [len(weight.dims)], shape
        )
        model.add_initializer(init_shape)

        reshape_node = onnx.helper.make_node("Reshape", [reshape_input_data, reshape_input_shape], [reshape_output])
        nodes.append(reshape_node)

        # Add an Add operation for the bias
        add_node = onnx.helper.make_node("Add", [scaled_output, reshape_output], [output], output + "_bias_add")
        nodes.append(add_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv"
        # Get quantized names from both the activation (input[0]) and the weight (input[1])
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        conv_integer_output = node.output[0] + "_output_quantized"
        conv_integer_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        conv_integer_node = onnx.helper.make_node(
            "ConvInteger", quantized_input_names + zero_point_names, [conv_integer_output], conv_integer_name, **kwargs
        )
        nodes.append(conv_integer_node)

        # Add a cast operation to cast the ConvInteger output to float.
        onnx_type = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
        cast_op_output = conv_integer_output + "_cast_output"
        cast_node = onnx.helper.make_node(
            "Cast",
            [conv_integer_output],
            [cast_op_output],
            conv_integer_output + "_cast",
            to=onnx_type,  # TODO: FLOAT or FLOAT16
        )
        nodes.append(cast_node)

        # Add a mul operation to multiply the scales of the two inputs.
        assert len(scale_names) == 2
        if conv_integer_name:
            scales_mul_op = conv_integer_name + "_scales_mul"
        else:
            scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"

        scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
        if scales_mul_node is None:
            scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
            nodes.append(scales_mul_node)

        scales_mul_op_output = scales_mul_node.output[0]

        has_bias = len(node.input) == 3
        scaled_output_name = node.output[0] if not has_bias else node.output[0] + "quant_scaled_output"

        # Add a mul operation to multiply the scales_mul result with the output of ConvInteger
        # and make the output of this node the same as the output of the original conv node.
        output_scale_mul_op = conv_integer_name + "_output_scale_mul" if conv_integer_name else ""
        nodes.append(
            get_mul_node(
                [cast_op_output, scales_mul_op_output],
                scaled_output_name,
                output_scale_mul_op,
            )
        )

        if has_bias:
            self.add_bias(nodes, scaled_output_name)

        self.quantizer.new_nodes += nodes


class QLinearConv(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv"

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
                node.input[1],
                onnx_proto.TensorProto.INT8,
                0,  # self.quantizer.weight_qType?
            )
            quantized_input_names.append(quant_weight_tuple[0])
            zero_point_names.append(quant_weight_tuple[1])
            scale_names.append(quant_weight_tuple[2])
        else:
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])

            (
                quantized_input_names_weight,
                zero_point_names_weight,
                scale_names_weight,
                nodes_weight,
            ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
            quantized_input_names.extend(quantized_input_names_weight)
            zero_point_names.extend(zero_point_names_weight)
            scale_names.extend(scale_names_weight)
            nodes.extend(nodes_weight)

        if not data_found or quantized_input_names is None:
            return super().quantize()

        quantized_bias_name = ""
        bias_present = False
        if len(node.input) == 3:
            if self.quantizer.weight_qType == onnx_proto.TensorProto.FLOAT8E4M3FN:
                raise RuntimeError("Quantization to FLOAT8E4M3FN for operator Conv is not supported.")
            quantized_bias_name = self.quantizer.quantize_bias_static(node.input[2], node.input[0], node.input[1])
            bias_present = True

        qlinear_conv_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_conv_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        qlinear_conv_inputs = []
        # Input 0
        qlinear_conv_inputs.append(quantized_input_names[0])
        qlinear_conv_inputs.append(scale_names[0])
        qlinear_conv_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_conv_inputs.append(quantized_input_names[1])
        qlinear_conv_inputs.append(scale_names[1])
        qlinear_conv_inputs.append(zero_point_names[1])

        # Output
        qlinear_conv_inputs.append(output_scale_name)
        qlinear_conv_inputs.append(output_zp_name)

        if bias_present:
            qlinear_conv_inputs.append(quantized_bias_name)

        qlinear_conv_node = onnx.helper.make_node(
            "QLinearConv", qlinear_conv_inputs, [qlinear_conv_output], qlinear_conv_name, **kwargs
        )
        nodes.append(qlinear_conv_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_conv_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQConv(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Conv" or node.op_type == "ConvTranspose"

        self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            self.quantizer.quantize_activation_tensor(node.output[0])

        is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=0 if node.op_type == "Conv" else 1
        )
        if is_weight_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
        else:
            self.quantizer.quantize_weight_tensor(node.input[1])

        if len(node.input) == 3:
            self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])
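# --- Illustrative sketch, not part of the commit: the arithmetic the ConvInteger
# path emits. ConvInteger yields int32 accumulators; Cast + Mul by
# (x_scale * w_scale) recovers the float output, and the bias is broadcast over
# channels via the inserted Reshape. Pure-numpy stand-in with hypothetical shapes.
import numpy as np

acc_int32 = np.random.randint(-(2**15), 2**15, size=(1, 8, 4, 4), dtype=np.int32)
x_scale, w_scale = 0.02, 0.005
bias = np.random.randn(8).astype(np.float32)
y = acc_int32.astype(np.float32) * (x_scale * w_scale) + bias.reshape(1, -1, 1, 1)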
@@ -0,0 +1,78 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


# For operators that support 8-bit operations directly and whose output can
# reuse input[0]'s type, zero point, and scale; for example, Transpose, Reshape, etc.
class Direct8BitOp(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        if not self.quantizer.force_quantize_no_input_check:
            # Keep backward compatibility:
            # quantize when input[0] is already quantized; otherwise keep the node as is.
            quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
            if quantized_input_value is None:
                self.quantizer.new_nodes += [node]
                return

            quantized_output_value = QuantizedValue(
                node.output[0],
                node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
                quantized_input_value.scale_name,
                quantized_input_value.zp_name,
                quantized_input_value.value_type,
            )
            self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

            node.input[0] = quantized_input_value.q_name
            node.output[0] = quantized_output_value.q_name
            self.quantizer.new_nodes += [node]

        else:
            # Force quantize these ops if possible; use the exclude-node list if this is not what you want.
            if not self.quantizer.is_valid_quantize_weight(node.input[0]):
                super().quantize()
                return

            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            if quantized_input_names is None:
                return super().quantize()

            # Create an entry for the output quantized value
            quantized_output_value = QuantizedValue(
                node.output[0],
                node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
                scale_names[0],
                zero_point_names[0],
                QuantizedValueType.Input,
            )
            self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

            node.input[0] = quantized_input_names[0]
            node.output[0] = quantized_output_value.q_name
            nodes.append(node)

            self.quantizer.new_nodes += nodes


class QDQDirect8BitOp(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        if self.quantizer.force_quantize_no_input_check:
            self.quantizer.quantize_activation_tensor(self.node.input[0])
            if not self.disable_qdq_for_node_output:
                self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
        elif self.quantizer.is_tensor_quantized(self.node.input[0]) and not self.disable_qdq_for_node_output:
            self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
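# --- Illustrative sketch, not part of the commit: why Direct8BitOp can reuse
# input[0]'s scale/zero-point for its output. Ops like Transpose or Reshape only
# rearrange elements, so dequantization commutes with them. Pure-numpy stand-in.
import numpy as np

q = np.array([[0, 64], [128, 255]], dtype=np.uint8)
scale, zp = 0.1, 128.0
assert np.array_equal(
    ((q.astype(np.float32) - zp) * scale).T,  # dequantize, then transpose
    (q.T.astype(np.float32) - zp) * scale,  # transpose, then dequantize
)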
@@ -0,0 +1,121 @@
import logging

import onnx
from onnx import onnx_pb as onnx_proto  # noqa: F401

from ..quant_utils import attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase

"""
Quantizes the EmbedLayerNorm fused ONNXRuntime Op.

This Quant operator keeps the input and segment IDs at int32 but will quantize all initializer and
weight inputs associated with the node to uint8.
"""


class EmbedLayerNormalizationQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        return self.quantizer.should_quantize_node(self.node)

    def quantize(self):
        node = self.node
        assert node.op_type == "EmbedLayerNormalization"

        if len(node.output) > 2:
            logging.info(f"Quantization is not applied to {node.name} since it has more than 2 outputs")
            return super().quantize()

        """
        Pre-quantization EmbedLayerNorm inputs:
        [0] input_ids (int32)
        [1] segment_ids (int32)
        [2] word_embedding (float32)
        [3] position_embedding (float32)
        [4] segment_embedding (float32)
        [5] gamma (float32)
        [6] beta (float32)
        [7] mask (int32) (optional)
        """
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [2, 3, 4, 5, 6])
        if quantized_input_names is None:
            return super().quantize()

        qembed_layer_norm_name = "" if not node.name else node.name + "_quant"

        """
        Quantized Input Tensor List
        [0] input_ids (int32)
        [1] segment_ids (int32)
        [2] word_embedding (uint8)
        [3] position_embedding (uint8)
        [4] segment_embedding (uint8)
        [5] gamma (uint8)
        [6] beta (uint8)
        [7] mask (int32) (optional)
        [8] word_embedding_scale (float)
        [9] position_embedding_scale (float)
        [10] segment_embedding_scale (float)
        [11] gamma_scale (float)
        [12] beta_scale (float)
        [13] word_embedding_zero_point (uint8)
        [14] position_embedding_zero_point (uint8)
        [15] segment_embedding_zero_point (uint8)
        [16] gamma_zero_point (uint8)
        [17] beta_zero_point (uint8)
        """
        inputs = []
        # 'input_ids'
        inputs.extend([node.input[0]])
        # 'segment_ids'
        inputs.extend([node.input[1]])
        # 'word_embedding_quant'
        inputs.extend([quantized_input_names[0]])
        # 'position_embedding_quant'
        inputs.extend([quantized_input_names[1]])
        # 'segment_embedding_quant'
        inputs.extend([quantized_input_names[2]])
        # 'gamma_quant'
        inputs.extend([quantized_input_names[3]])
        # 'beta_quant'
        inputs.extend([quantized_input_names[4]])
        # 'mask' (optional)
        inputs.extend([node.input[7] if len(node.input) > 7 else ""])

        # Add all scales:
        inputs.extend([scale_names[0]])
        inputs.extend([scale_names[1]])
        inputs.extend([scale_names[2]])
        inputs.extend([scale_names[3]])
        inputs.extend([scale_names[4]])

        # Add all zero points:
        inputs.extend([zero_point_names[0]])
        inputs.extend([zero_point_names[1]])
        inputs.extend([zero_point_names[2]])
        inputs.extend([zero_point_names[3]])
        inputs.extend([zero_point_names[4]])

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qembed_layer_norm_node = onnx.helper.make_node(
            "QEmbedLayerNormalization",
            inputs,
            node.output,
            qembed_layer_norm_name,
            **kwargs,
        )
        nodes.append(qembed_layer_norm_node)

        self.quantizer.new_nodes += nodes
@@ -0,0 +1,64 @@
from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase

"""
Quantize Gather
"""


class GatherQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            return False

        return self.quantizer.is_valid_quantize_weight(self.node.input[0])

    def quantize(self):
        node = self.node
        assert node.op_type == "Gather"

        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if quantized_input_names is None:
            return super().quantize()

        gather_new_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            gather_new_output,
            scale_names[0],
            zero_point_names[0],
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        node.output[0] = gather_new_output
        node.input[0] = quantized_input_names[0]
        nodes.append(node)

        self.quantizer.new_nodes += nodes


class QDQGather(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gather" or node.op_type == "GatherElements"

        if self.quantizer.is_valid_quantize_weight(node.input[0]) or self.quantizer.force_quantize_no_input_check:
            self.quantizer.quantize_activation_tensor(node.input[0])
            self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
        elif self.quantizer.is_tensor_quantized(node.input[0]):
            self.quantizer.quantize_output_same_as_input(node.output[0], node.input[0], node.name)
@@ -0,0 +1,62 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QGlobalAveragePool(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "GlobalAveragePool"

        # If the input to this node is not quantized, keep this node.
        if node.input[0] not in self.quantizer.quantized_value_map:
            return super().quantize()

        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]

        # Create an entry for the output quantized value.
        (
            data_found,
            output_scale_name_from_parameter,
            output_zp_name_from_parameter,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        # Just use the input scale and zero point if parameters for the output are not specified.
        output_scale_name = output_scale_name_from_parameter if data_found else quantized_input_value.scale_name
        output_zp_name = output_zp_name_from_parameter if data_found else quantized_input_value.zp_name
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        kwargs["channels_last"] = 0
        qnode_name = node.name + "_quant" if node.name else ""

        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_value.q_name,
                quantized_input_value.scale_name,
                quantized_input_value.zp_name,
                output_scale_name,
                output_zp_name,
            ],
            [quantized_output_value.q_name],
            qnode_name,
            **kwargs,
        )
        self.quantizer.new_nodes += [qnode]
@@ -0,0 +1,172 @@
import logging

import numpy as np  # noqa: F401
import onnx

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    find_by_name,  # noqa: F401
    get_mul_node,  # noqa: F401
    ms_domain,
)
from .base_operator import QuantOperatorBase  # noqa: F401
from .matmul import QOpMatMul
from .qdq_base_operator import QDQOperatorBase


def is_B_transposed(gemm_node):  # noqa: N802
    transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"]  # noqa: N806
    if transB_attribute:
        return onnx.helper.get_attribute_value(transB_attribute[0]) > 0

    return False


def get_beta(gemm_node):
    beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
    if beta_attribute:
        return onnx.helper.get_attribute_value(beta_attribute[0])

    return 1.0


def set_default_beta(gemm_node):
    beta_attribute = [attr for attr in gemm_node.attribute if attr.name == "beta"]
    if beta_attribute:
        beta_attribute[0].f = 1.0

    return 1.0


class QLinearGemm(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gemm"

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        if self.quantizer.is_input_a_initializer(node.input[1]) and self.quantizer.is_per_channel():
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])
            quant_weight_tuple = self.quantizer.quantize_weight_per_channel(
                node.input[1],
                self.quantizer.weight_qType,
                0 if is_B_transposed(node) else 1,
            )
            quantized_input_names.append(quant_weight_tuple[0])
            zero_point_names.append(quant_weight_tuple[1])
            scale_names.append(quant_weight_tuple[2])
        else:
            # Get quantized names from both the activation (input[0]) and the weight (input[1])
            (
                quantized_input_names,
                zero_point_names,
                scale_names,
                nodes,
            ) = self.quantizer.quantize_activation(node, [0])

            (
                quantized_input_names_weight,
                zero_point_names_weight,
                scale_names_weight,
                nodes_weight,
            ) = self.quantizer.quantize_weight(node, [1], reduce_range=self.quantizer.reduce_range)
            quantized_input_names.extend(quantized_input_names_weight)
            zero_point_names.extend(zero_point_names_weight)
            scale_names.extend(scale_names_weight)
            nodes.extend(nodes_weight)

        if not data_found or quantized_input_names is None:
            return super().quantize()

        quantized_bias_name = ""
        if len(node.input) == 3:
            if not self.quantizer.is_input_a_initializer(node.input[2]):
                return super().quantize()

            # Note: if the quantized type is float 8, the bias is converted into float 16.
            # cublasLtMatMul only supports (b)float16 or float32 bias.
            quantized_bias_name = self.quantizer.quantize_bias_static(
                node.input[2], node.input[0], node.input[1], get_beta(self.node)
            )

        qgemm_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qgemm_name = node.name + "_quant" if node.name else ""

        kwargs = {}
        for attribute in node.attribute:
            if attribute.name != "beta":
                kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        # generate inputs
        qgemm_inputs = []
        for i in range(2):
            qgemm_inputs.extend([quantized_input_names[i], scale_names[i], zero_point_names[i]])

        qgemm_inputs.extend([quantized_bias_name, output_scale_name, output_zp_name])

        qgemm_node = onnx.helper.make_node("QGemm", qgemm_inputs, [qgemm_output], qgemm_name, **kwargs)
        nodes.append(qgemm_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qgemm_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
            node_type=node.op_type,
            node_qtype=self.quantizer.weight_qType,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQGemm(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Gemm"

        self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            self.quantizer.quantize_activation_tensor(node.output[0])

        is_weight_per_channel, weight_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=0 if is_B_transposed(node) else 1
        )
        if is_weight_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], weight_axis)
        else:
            self.quantizer.quantize_weight_tensor(node.input[1])

        if len(node.input) == 3:
            if self.quantizer.is_input_a_initializer(node.input[2]):
                self.quantizer.quantize_bias_tensor(
                    node.name, node.input[2], node.input[0], node.input[1], get_beta(self.node)
                )
                set_default_beta(self.node)
            else:
                logging.warning(
                    f"Bias of Gemm node '{self.node.name}' is not constant. Please exclude this node for better performance."
                )
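# --- Illustrative sketch, not part of the commit: how is_B_transposed/get_beta
# read Gemm attributes, using only standard onnx helpers.
import onnx

gemm = onnx.helper.make_node("Gemm", ["A", "B", "C"], ["Y"], transB=1, beta=0.5)
attrs = {a.name: onnx.helper.get_attribute_value(a) for a in gemm.attribute}
assert attrs["transB"] > 0 and attrs["beta"] == 0.5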
@@ -0,0 +1,121 @@
import numpy
import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain  # noqa: F401
from .base_operator import QuantOperatorBase

"""
Quantize LSTM
"""


class LSTMQuant(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        """
        parameter node: LSTM node.
        parameter new_nodes_list: List of new nodes created before processing this node.
        return: a list of nodes in topological order that represents the quantized LSTM node.
        """
        node = self.node
        assert node.op_type == "LSTM"

        if not self.quantizer.is_valid_quantize_weight(node.input[1]) or not self.quantizer.is_valid_quantize_weight(
            node.input[2]
        ):
            super().quantize()
            return

        model = self.quantizer.model
        W = model.get_initializer(node.input[1])  # noqa: N806
        R = model.get_initializer(node.input[2])  # noqa: N806

        if len(W.dims) != 3 or len(R.dims) != 3:
            super().quantize()
            return

        [W_num_dir, W_4_hidden_size, W_input_size] = W.dims  # noqa: N806
        [R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims  # noqa: N806

        if self.quantizer.is_per_channel():
            del W.dims[0]
            del R.dims[0]
            W.dims[0] = W_num_dir * W_4_hidden_size
            R.dims[0] = R_num_dir * R_4_hidden_size

        quant_input_weight_tuple = self.quantizer.quantize_weight_per_channel(
            node.input[1],
            onnx_proto.TensorProto.INT8,
            0,  # self.quantizer.weight_qType?
        )
        quant_recurrent_weight_tuple = self.quantizer.quantize_weight_per_channel(
            node.input[2],
            onnx_proto.TensorProto.INT8,
            0,  # self.quantizer.weight_qType?
        )

        W_quant_weight = model.get_initializer(quant_input_weight_tuple[0])  # noqa: N806
        R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0])  # noqa: N806

        W_quant_array = onnx.numpy_helper.to_array(W_quant_weight)  # noqa: N806
        R_quant_array = onnx.numpy_helper.to_array(R_quant_weight)  # noqa: N806

        W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size))  # noqa: N806
        R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size))  # noqa: N806

        W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1))  # noqa: N806
        R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1))  # noqa: N806

        W_quant_transposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0])  # noqa: N806
        R_quant_transposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0])  # noqa: N806

        model.remove_initializers([W_quant_weight, R_quant_weight])
        model.add_initializer(W_quant_transposed)
        model.add_initializer(R_quant_transposed)

        W_quant_zp = model.get_initializer(quant_input_weight_tuple[1])  # noqa: N806
        R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1])  # noqa: N806
        W_quant_scale = model.get_initializer(quant_input_weight_tuple[2])  # noqa: N806
        R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2])  # noqa: N806

        if self.quantizer.is_per_channel():
            W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size]
            R_quant_zp.dims[:] = [R_num_dir, R_4_hidden_size]
            W_quant_scale.dims[:] = [W_num_dir, W_4_hidden_size]
            R_quant_scale.dims[:] = [R_num_dir, R_4_hidden_size]

        inputs = []
        input_len = len(node.input)
        inputs.extend([node.input[0]])
        inputs.extend([quant_input_weight_tuple[0], quant_recurrent_weight_tuple[0]])
        inputs.extend([node.input[3] if input_len > 3 else ""])
        inputs.extend([node.input[4] if input_len > 4 else ""])
        inputs.extend([node.input[5] if input_len > 5 else ""])
        inputs.extend([node.input[6] if input_len > 6 else ""])
        inputs.extend([node.input[7] if input_len > 7 else ""])
        inputs.extend(
            [
                quant_input_weight_tuple[2],
                quant_input_weight_tuple[1],
                quant_recurrent_weight_tuple[2],
                quant_recurrent_weight_tuple[1],
            ]
        )

        kwargs = {}
        for attribute in node.attribute:
            if attribute.name == "layout":
                continue
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        quant_lstm_name = "" if not node.name else node.name + "_quant"
        quant_lstm_node = onnx.helper.make_node("DynamicQuantizeLSTM", inputs, node.output, quant_lstm_name, **kwargs)
        self.quantizer.new_nodes.append(quant_lstm_node)

        dequantize_node = self.quantizer._dequantize_value(node.input[0])
        if dequantize_node is not None:
            self.quantizer.new_nodes.append(dequantize_node)
@@ -0,0 +1,231 @@
import itertools
import logging

import onnx
from onnx import onnx_pb as onnx_proto

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, find_by_name, get_mul_node
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QOpMatMul(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def should_quantize(self):
        if not self.quantizer.should_quantize_node(self.node):
            logging.debug(f"Ignore MatMul {self.node.name}")
            return False

        if (not self.quantizer.is_float_tensor(self.node.input[1])) and (
            not self.quantizer.is_float_tensor(self.node.input[0])
        ):
            logging.info(f"Ignore MatMul due to non-float inputs {self.node.name}")
            return False

        # do not quantize non-constant B matrices for matmul
        if self.quantizer.q_matmul_const_b_only:
            if not self.quantizer.find_initializer_in_path(self.node.input[1]):
                logging.info(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]")
                return False
        return True


"""
Used when quantize mode is QuantizationMode.IntegerOps.
"""


class MatMulInteger(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"
        # Get quantized names from both the activation (input[0]) and the weight (input[1])
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        matmul_integer_output = node.output[0] + "_output_quantized"
        matmul_integer_name = node.name + "_quant" if node.name else ""
        matmul_integer_node = onnx.helper.make_node(
            "MatMulInteger",
            quantized_input_names + zero_point_names,
            [matmul_integer_output],
            matmul_integer_name,
        )
        nodes.append(matmul_integer_node)

        # Add a cast operation to cast the MatMulInteger output to float.
        cast_op_output = matmul_integer_output + "_cast_output"
        otype = self.quantizer.get_tensor_type(node.output[0], mandatory=True)
        cast_node = onnx.helper.make_node(
            "Cast",
            [matmul_integer_output],
            [cast_op_output],
            matmul_integer_output + "_cast",
            to=otype,
        )
        nodes.append(cast_node)

        # Add a mul operation to multiply the scales of the two inputs.
        assert len(scale_names) == 2
        scales_mul_op = (
            matmul_integer_name + "_scales_mul"
            if matmul_integer_name
            else scale_names[0] + "_" + scale_names[1] + "_mul"
        )

        scales_mul_node = find_by_name(scales_mul_op, self.quantizer.new_nodes)
        if scales_mul_node is None:
            scales_mul_node = get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
            nodes.append(scales_mul_node)

        scales_mul_op_output = scales_mul_node.output[0]

        # Add a mul operation to multiply the scales_mul result with the output of MatMulInteger
        # and make the output of this node the same as the output of the original matmul node.
        output_scale_mul_op = ""
        if matmul_integer_name:
            output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
        nodes.append(
            get_mul_node(
                [cast_op_output, scales_mul_op_output],
                node.output[0],
                output_scale_mul_op,
            )
        )
        self.quantizer.new_nodes += nodes


"""
Used when quantize mode is QuantizationMode.QLinearOps.
"""


class QLinearMatMul(QOpMatMul):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"
        # Get quantized names from both the activation (input[0]) and the weight (input[1])
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        (
            quantized_input_names_weight,
            zero_point_names_weight,
            scale_names_weight,
            nodes_weight,
        ) = self.quantizer.quantize_weight(node, [1], reduce_range=True, op_level_per_channel=True)
        quantized_input_names.extend(quantized_input_names_weight)
        zero_point_names.extend(zero_point_names_weight)
        scale_names.extend(scale_names_weight)
        nodes.extend(nodes_weight)

        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        if not data_found or quantized_input_names is None:
            return super().quantize()

        qlinear_matmul_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_matmul_name = node.name + "_quant" if node.name else ""

        qlinear_matmul_inputs = []
        # Input 0
        qlinear_matmul_inputs.append(quantized_input_names[0])
        qlinear_matmul_inputs.append(scale_names[0])
        qlinear_matmul_inputs.append(zero_point_names[0])
        # Input 1
        qlinear_matmul_inputs.append(quantized_input_names[1])
        qlinear_matmul_inputs.append(scale_names[1])
        qlinear_matmul_inputs.append(zero_point_names[1])
        # Output quantization parameters
        qlinear_matmul_inputs.append(output_scale_name)
        qlinear_matmul_inputs.append(output_zp_name)

        domain = (
            "com.microsoft"
            if self.quantizer.weight_qType
            in {
                onnx_proto.TensorProto.FLOAT8E4M3FN,
                onnx_proto.TensorProto.FLOAT8E4M3FNUZ,
                onnx_proto.TensorProto.FLOAT8E5M2,
                onnx_proto.TensorProto.FLOAT8E5M2FNUZ,
            }
            else ""
        )
        qlinear_matmul_node = onnx.helper.make_node(
            "QLinearMatMul",
            qlinear_matmul_inputs,
            [qlinear_matmul_output],
            qlinear_matmul_name,
            domain=domain,
        )
        nodes.append(qlinear_matmul_node)

        # Create an entry for this quantized value
        q_output = QuantizedValue(
            node.output[0],
            qlinear_matmul_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        self.quantizer.new_nodes += nodes


class QDQMatMul(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MatMul"

        if self.disable_qdq_for_node_output:
            nodes_to_iterate = node.input
        else:
            nodes_to_iterate = itertools.chain(node.input, node.output)

        for tensor_name in nodes_to_iterate:
            if find_by_name(tensor_name, self.quantizer.model.initializer()):
                is_per_channel, channel_axis = self.quantizer.is_tensor_per_channel(
                    tensor_name, default_axis=1, op_type=node.op_type
                )
                if is_per_channel:
                    self.quantizer.quantize_weight_tensor_per_channel(tensor_name, channel_axis)
                else:
                    self.quantizer.quantize_weight_tensor(tensor_name)
            else:
                self.quantizer.quantize_activation_tensor(tensor_name)
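# --- Illustrative sketch, not part of the commit: the identity behind the
# MatMulInteger + Cast + Mul pattern above,
#   A @ B ≈ (a_scale * b_scale) * ((qA - a_zp) @ (qB - b_zp)),
# which is why multiplying the casted integer output by the product of the two
# scales recovers the float result. Pure-numpy stand-in with hypothetical values.
import numpy as np

a_scale, a_zp = 0.1, 128
b_scale, b_zp = 0.05, 0
qA = np.array([[130, 120], [140, 110]], dtype=np.int32)
qB = np.array([[10, -20], [30, 5]], dtype=np.int32)
approx = a_scale * b_scale * ((qA - a_zp) @ (qB - b_zp))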
@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp


class QMaxPool(Direct8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MaxPool"

        # if the opset version is less than 12, fall back to the normal quantize path.
        if self.quantizer.opset_version < 12:
            super(Direct8BitOp, self).quantize()
            return

        # Direct 8-bit op
        return super().quantize()


class QDQMaxPool(QDQDirect8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "MaxPool"

        # if the opset version is less than 12, make no change
        if self.quantizer.opset_version < 12:
            return

        # Direct 8-bit op
        return super().quantize()
@@ -0,0 +1,40 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from .qdq_base_operator import QDQOperatorBase


class QDQNormalization(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"}

        # Input
        self.quantizer.quantize_activation_tensor(node.input[0])

        # Scale
        scale_is_initializer = self.quantizer.is_input_a_initializer(node.input[1])
        scale_is_per_channel, scale_channel_axis = self.quantizer.is_tensor_per_channel(
            node.input[1], default_axis=1, op_type=node.op_type
        )

        if scale_is_per_channel:
            self.quantizer.quantize_weight_tensor_per_channel(node.input[1], axis=scale_channel_axis)
        elif scale_is_initializer:
            self.quantizer.quantize_weight_tensor(node.input[1])
        else:
            self.quantizer.quantize_activation_tensor(node.input[1])

        # Bias
        if len(node.input) > 2 and node.input[2]:
            self.quantizer.quantize_bias_tensor(node.name, node.input[2], node.input[0], node.input[1])

        # Output
        if not self.disable_qdq_for_node_output:
            for output_name in node.output:
                self.quantizer.quantize_activation_tensor(output_name)
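A compact sketch of the three-way routing QDQNormalization applies to the scale input above (function name hypothetical):

def plan_scale_quantization(is_per_channel: bool, is_initializer: bool) -> str:
    # Mirrors the branch order above: per-channel wins, then
    # initializer-as-weight, otherwise treat the scale as an activation.
    if is_per_channel:
        return "quantize_weight_tensor_per_channel"
    if is_initializer:
        return "quantize_weight_tensor"
    return "quantize_activation_tensor"

print(plan_scale_quantization(True, True))    # per-channel takes precedence
print(plan_scale_quantization(False, False))  # dynamic scale -> activation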
@@ -0,0 +1,172 @@
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from __future__ import annotations

from typing import Any

import numpy as np
import onnx

from ..quant_utils import (
    TENSOR_NAME_QUANT_SUFFIX,
    QuantizedValue,
    QuantizedValueType,
    attribute_to_kwarg,
    quantize_nparray,
)
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QPad(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Pad"

        # The optional 'constant_value' input is only available from opset 11.
        # If input[0] is not quantized, do not quantize this node.
        if (self.quantizer.opset_version < 11) or (node.input[0] not in self.quantizer.quantized_value_map):
            super().quantize()
            return
        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]

        kwargs = {}
        for attribute in node.attribute:
            kv = attribute_to_kwarg(attribute)
            kwargs.update(kv)

        if "mode" not in kwargs or kwargs["mode"] == b"constant":
            if len(node.input) > 2 and node.input[2] != "":  # There is a 3rd input, 'constant_value'
                zp_tensor = self.quantizer.model.get_initializer(quantized_input_value.zp_name)
                scale_tensor = self.quantizer.model.get_initializer(quantized_input_value.scale_name)
                if zp_tensor is None or scale_tensor is None:
                    super().quantize()
                    return

                padding_constant_initializer = self.quantizer.model.get_initializer(node.input[2])
                if padding_constant_initializer is not None:
                    zp_array = onnx.numpy_helper.to_array(zp_tensor)
                    zp_value = zp_array.item() if zp_array.ndim == 0 else zp_array[0]
                    scale_array = onnx.numpy_helper.to_array(scale_tensor)
                    scale_value = scale_array.item() if scale_array.ndim == 0 else scale_array[0]
                    padding_constant_array = onnx.numpy_helper.to_array(padding_constant_initializer)
                    quantized_padding_constant_array = quantize_nparray(
                        self.quantizer.activation_qType,
                        padding_constant_array,
                        scale_value,
                        zp_value,
                    )
                    quantized_padding_constant_name = node.input[2] + TENSOR_NAME_QUANT_SUFFIX
                    quantized_padding_constant_initializer = onnx.numpy_helper.from_array(
                        quantized_padding_constant_array,
                        quantized_padding_constant_name,
                    )
                    # Assume this padding constant initializer is used only by this node.
                    self.quantizer.model.remove_initializer(padding_constant_initializer)
                    self.quantizer.model.add_initializer(quantized_padding_constant_initializer)
                    node.input[2] = quantized_padding_constant_name
                else:
                    # TODO: check quantize_inputs after subgraph is supported
                    pad_value_qnodes = self.quantizer._get_quantize_input_nodes(
                        node,
                        2,
                        self.quantizer.activation_qType,
                        quantized_input_value.scale_name,
                        quantized_input_value.zp_name,
                        initial_type=scale_tensor.data_type,
                    )
                    self.quantizer.new_nodes.extend(pad_value_qnodes)
                    node.input[2] = pad_value_qnodes[0].output[0]
            else:
                # In quantized format, the `zero` before quantization is mapped
                # to quantized_input_value.zp_name. Thus, padding the original
                # tensor with 0 should become padding the quantized tensor with
                # the zero point.
                if len(node.input) == 2:
                    # Feed quantization's zero point to the padding node.
                    node.input.append(quantized_input_value.zp_name)
                else:
                    # Assign quantization's zero point to the padding node.
                    assert node.input[2] == ""
                    node.input[2] = quantized_input_value.zp_name

        # Create an entry for the output quantized value
        quantized_output_value = QuantizedValue(
            node.output[0],
            node.output[0] + TENSOR_NAME_QUANT_SUFFIX,
            quantized_input_value.scale_name,
            quantized_input_value.zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        node.input[0] = quantized_input_value.q_name
        node.output[0] = quantized_output_value.q_name
        self.quantizer.new_nodes += [node]


class QDQPad(QDQOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def _get_pad_const_val(self, attrs_dict: dict[str, Any]) -> np.ndarray | None:
        """
        Returns the Pad's constant padding value. Returns `None` if the padding value is
        not constant (i.e., comes from a dynamic input).
        """
        const_val = None
        onnx_tensor_type = self.quantizer.model.get_tensor_type(self.node.input[0])
        if onnx_tensor_type is None:
            return None

        np_dtype = onnx.helper.tensor_dtype_to_np_dtype(onnx_tensor_type.elem_type)
        if self.quantizer.opset_version < 11:
            const_val = np.array(attrs_dict.get("value", 0), dtype=np_dtype)
        elif len(self.node.input) >= 3 and self.node.input[2]:
            const_val = self.quantizer.model.get_constant_value(self.node.input[2])
        else:
            const_val = np.array(0, dtype=np_dtype)

        return const_val

    def _should_quantize_output_same_as_input(self) -> bool:
        """
        Returns true if Pad's output should use the same quantization parameters as input[0].
        """
        attrs_dict = {}
        for attribute in self.node.attribute:
            kv = attribute_to_kwarg(attribute)
            attrs_dict.update(kv)

        pad_mode = attrs_dict.get("mode", b"constant")
        if pad_mode in (b"reflect", b"edge", b"wrap"):
            # These modes pad the output with a value that already exists in the input.
            # So, we can quantize the output the same as the input.
            return True

        # For 'constant' mode, if padding with 0, we can also quantize the output the same as the input
        # because our quantization floating-point range always includes 0.
        if pad_mode == b"constant":
            pad_val = self._get_pad_const_val(attrs_dict)
            if pad_val is not None and pad_val.dtype in (np.float32, np.float16):
                return float(pad_val.item()) == 0

        return False

    def quantize(self):
        assert self.node.op_type == "Pad"

        for input_name in self.node.input:
            if input_name:
                self.quantizer.quantize_activation_tensor(input_name)

        if not self.disable_qdq_for_node_output:
            if self._should_quantize_output_same_as_input():
                self.quantizer.quantize_output_same_as_input(self.node.output[0], self.node.input[0], self.node.name)
            else:
                self.quantizer.quantize_activation_tensor(self.node.output[0])
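The constant-value handling above relies on affine quantization mapping 0.0 exactly to the zero point; a small numpy sketch of roughly what quantize_nparray computes, assuming uint8 activations (the function name here is illustrative):

import numpy as np

def quantize_to_uint8(x: np.ndarray, scale: float, zero_point: int) -> np.ndarray:
    # Affine quantization: q = clip(round(x / scale) + zp, 0, 255).
    q = np.round(x / scale) + zero_point
    return np.clip(q, 0, 255).astype(np.uint8)

# Padding with 0.0 maps exactly to the zero point, which is why QPad can feed
# the zero-point tensor as the constant_value of the quantized Pad node.
print(quantize_to_uint8(np.array([0.0]), scale=0.02, zero_point=128))  # [128]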
@@ -0,0 +1,67 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearPool(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node

        # only try to quantize when given quantization parameters for it
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])

        # get quantized input tensor names, quantize input if needed
        (
            quantized_input_names,
            input_zero_point_names,
            input_scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        if not data_found or quantized_input_names is None:
            return super().quantize()

        # Create an entry for the output quantized value.
        qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        quantized_output_value = QuantizedValue(
            node.output[0],
            qlinear_output_name,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        # Create a QLinear pool node for the given type (AveragePool, etc.)
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        qlinear_node_name = node.name + "_quant" if node.name else ""
        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_names[0],
                input_scale_names[0],
                input_zero_point_names[0],
                output_scale_name,
                output_zp_name,
            ],
            [qlinear_output_name],
            qlinear_node_name,
            **kwargs,
        )

        # add all newly created nodes
        nodes.append(qnode)
        self.quantizer.new_nodes += nodes
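For illustration, building a QLinear pool node directly with onnx.helper; the tensor names are hypothetical, and the five-input layout mirrors the list assembled above:

import onnx

qnode = onnx.helper.make_node(
    "QLinearAveragePool",
    ["X_quantized", "X_scale", "X_zero_point", "Y_scale", "Y_zero_point"],
    ["Y_quantized"],
    "avgpool_quant",
    domain="com.microsoft",  # ms_domain
    kernel_shape=[2, 2],
)
print(qnode.op_type, qnode.domain)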
@@ -0,0 +1,22 @@
import itertools

from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray  # noqa: F401
from .base_operator import QuantOperatorBase  # noqa: F401


class QDQOperatorBase:
    def __init__(self, onnx_quantizer, onnx_node):
        self.quantizer = onnx_quantizer
        self.node = onnx_node
        self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization

    def quantize(self):
        node = self.node

        if self.disable_qdq_for_node_output:
            tensors_to_quantize = node.input
        else:
            tensors_to_quantize = itertools.chain(node.input, node.output)

        for tensor_name in tensors_to_quantize:
            self.quantizer.quantize_activation_tensor(tensor_name)
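The default quantize() above only decides which tensors get Q/DQ pairs; a tiny sketch of the input/output selection (sample names are made up):

import itertools

inputs, outputs = ["A", "B"], ["Y"]
disable_output = False  # True for op types excluded from output quantization
tensors = inputs if disable_output else itertools.chain(inputs, outputs)
print(list(tensors))  # ['A', 'B', 'Y']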
@@ -0,0 +1,34 @@
from .direct_q8 import Direct8BitOp, QDQDirect8BitOp


class QResize(Direct8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Resize"

        # If the opset version is less than 11, fall back to the normal quantization path.
        if self.quantizer.opset_version < 11:
            super(Direct8BitOp, self).quantize()
            return

        # Direct 8-bit op
        return super().quantize()


class QDQResize(QDQDirect8BitOp):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        assert node.op_type == "Resize"

        # If the opset version is less than 11, keep this node unchanged.
        if self.quantizer.opset_version < 11:
            return

        # Direct 8-bit op
        return super().quantize()
@@ -0,0 +1,74 @@
import onnx
import onnx.helper

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase


class QLinearSoftmax(QuantOperatorBase):
    def quantize(self):
        node = self.node
        # Constrain the softmax output's scale and zero point: softmax outputs always lie in [0, 1].
        if self.quantizer.activation_qType == onnx.onnx_pb.TensorProto.UINT8:
            out_scale = 1 / 256.0
            out_zero_point = 0
        else:
            out_scale = 1 / 256.0
            out_zero_point = -128
        # only try to quantize when given quantization parameters for it
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0], out_scale, out_zero_point)

        # get quantized input tensor names, quantize input if needed
        (
            quantized_input_names,
            input_zero_point_names,
            input_scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])

        if not data_found or quantized_input_names is None:
            return super().quantize()

        # Create an entry for the output quantized value.
        qlinear_output_name = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        quantized_output_value = QuantizedValue(
            node.output[0],
            qlinear_output_name,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value

        # Create the QLinearSoftmax node
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain
        # Give QLinearSoftmax the real opset_version; otherwise its default SinceVersion would be 1.
        kwargs["opset"] = self.quantizer.opset_version
        qlinear_node_name = node.name + "_quant" if node.name else ""
        qnode = onnx.helper.make_node(
            "QLinear" + node.op_type,
            [
                quantized_input_names[0],
                input_scale_names[0],
                input_zero_point_names[0],
                output_scale_name,
                output_zp_name,
            ],
            [qlinear_output_name],
            qlinear_node_name,
            **kwargs,
        )

        # add all newly created nodes
        nodes.append(qnode)
        self.quantizer.new_nodes += nodes
        return None
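Why the fixed 1/256 scale works: softmax outputs lie in [0, 1], so the output range needs no calibration; a short sketch (helper name hypothetical) of the resulting representable ranges:

def softmax_output_qparams(qtype: str):
    # A fixed scale of 1/256 covers [0, 1]; the zero point anchors 0.0 at the
    # bottom of the integer range for both uint8 and int8 activations.
    scale = 1.0 / 256.0
    zero_point = 0 if qtype == "uint8" else -128
    return scale, zero_point

scale, zp = softmax_output_qparams("int8")
qmin, qmax = -128, 127
# Representable range: [(qmin - zp) * scale, (qmax - zp) * scale]
print((qmin - zp) * scale, (qmax - zp) * scale)  # 0.0 .. ~0.996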
@@ -0,0 +1,63 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QSplit(QuantOperatorBase):
    def __init__(self, onnx_quantizer, onnx_node):
        super().__init__(onnx_quantizer, onnx_node)

    def quantize(self):
        node = self.node
        (
            quantized_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [0])
        if quantized_input_names is None:
            return super().quantize()

        quantized_node_name = ""
        if node.name:
            quantized_node_name = node.name + "_quant"
        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))

        # Outputs simply derive their scale/zero point from the input
        quantized_output_names = []
        for output_name in node.output:
            quantized_output_name = output_name + TENSOR_NAME_QUANT_SUFFIX
            quantized_output_names.append(quantized_output_name)
            q_output = QuantizedValue(
                output_name,
                quantized_output_name,
                scale_names[0],
                zero_point_names[0],
                QuantizedValueType.Input,
            )
            self.quantizer.quantized_value_map[output_name] = q_output

        if len(node.input) > 1:
            quantized_input_names.extend(node.input[1:])
        quantized_node = onnx.helper.make_node(
            node.op_type, quantized_input_names, quantized_output_names, quantized_node_name, **kwargs
        )

        nodes.append(quantized_node)
        self.quantizer.new_nodes += nodes


class QDQSplit(QDQOperatorBase):
    def quantize(self):
        node = self.node
        assert node.op_type == "Split"

        if not self.quantizer.is_tensor_quantized(node.input[0]):
            self.quantizer.quantize_activation_tensor(node.input[0])
        if not self.disable_qdq_for_node_output:
            for output in node.output:
                self.quantizer.quantize_output_same_as_input(output, node.input[0], node.name)
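Since Split only partitions values, each output can safely reuse the input's quantization parameters; a toy version of that bookkeeping with made-up names:

input_qparams = {"scale": 0.05, "zero_point": 3}
outputs = ["out0", "out1", "out2"]
# Every output shares the input's scale/zero point verbatim.
quantized_value_map = {name: dict(input_qparams) for name in outputs}
print(quantized_value_map["out1"])  # {'scale': 0.05, 'zero_point': 3}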
@@ -0,0 +1,87 @@
import onnx

from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain
from .base_operator import QuantOperatorBase
from .qdq_base_operator import QDQOperatorBase


class QLinearWhere(QuantOperatorBase):
    def should_quantize(self):
        return True

    def quantize(self):
        node = self.node
        assert node.op_type == "Where"
        if not self.quantizer.force_quantize_no_input_check:
            self.quantizer.new_nodes += [node]
            return
        (
            data_found,
            output_scale_name,
            output_zp_name,
            _,
            _,
        ) = self.quantizer._get_quantization_params(node.output[0])
        (
            q_input_names,
            zero_point_names,
            scale_names,
            nodes,
        ) = self.quantizer.quantize_activation(node, [1, 2])
        if not data_found or q_input_names is None:
            return super().quantize()
        qlinear_output = node.output[0] + TENSOR_NAME_QUANT_SUFFIX
        qlinear_output_name = node.name + "_quant" if node.name else ""

        q_output = QuantizedValue(
            node.output[0],
            qlinear_output,
            output_scale_name,
            output_zp_name,
            QuantizedValueType.Input,
        )
        self.quantizer.quantized_value_map[node.output[0]] = q_output

        kwargs = {}
        for attribute in node.attribute:
            kwargs.update(attribute_to_kwarg(attribute))
        kwargs["domain"] = ms_domain

        qlwhere_inputs = [
            node.input[0],
            q_input_names[0],
            scale_names[0],
            zero_point_names[0],
            q_input_names[1],
            scale_names[1],
            zero_point_names[1],
            output_scale_name,
            output_zp_name,
        ]
        qlwhere_node = onnx.helper.make_node(
            "QLinearWhere", qlwhere_inputs, [qlinear_output], qlinear_output_name, **kwargs
        )

        self.quantizer.new_nodes += nodes
        self.quantizer.new_nodes += [qlwhere_node]


class QDQWhere(QDQOperatorBase):
    def quantize(self):
        node = self.node
        assert node.op_type == "Where"
        if self.quantizer.force_quantize_no_input_check:
            if not self.quantizer.is_tensor_quantized(node.input[1]):
                self.quantizer.quantize_activation_tensor(node.input[1])
            if not self.quantizer.is_tensor_quantized(node.input[2]):
                self.quantizer.quantize_activation_tensor(node.input[2])
            if not self.disable_qdq_for_node_output:
                for output in node.output:
                    self.quantizer.quantize_activation_tensor(output)
        elif (
            self.quantizer.is_tensor_quantized(node.input[1])
            and self.quantizer.is_tensor_quantized(node.input[2])
            and not self.disable_qdq_for_node_output
        ):
            for output in node.output:
                self.quantizer.quantize_activation_tensor(output)
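A sketch of the QLinearWhere input layout built above, with hypothetical tensor names: the boolean condition passes through unquantized, followed by each value branch's quantized tensor, scale, and zero point, then the output's scale and zero point:

qlwhere_inputs = [
    "cond",                                       # boolean condition, not quantized
    "X_quantized", "X_scale", "X_zero_point",     # first value branch
    "Y_quantized", "Y_scale", "Y_zero_point",     # second value branch
    "out_scale", "out_zero_point",                # output quantization parameters
]
print(len(qlwhere_inputs))  # 9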