import argparse
import os

from typing import Optional

from ctranslate2.converters import utils
from ctranslate2.converters.converter import Converter
from ctranslate2.specs import common_spec, transformer_spec

_SUPPORTED_MODELS = {
    "bart",
    "multilingual_transformer",
    "transformer",
    "transformer_align",
    "transformer_lm",
}
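
# These are fairseq model names (the values of ARCH_MODEL_NAME_REGISTRY), not
# architecture names: e.g. the "transformer_wmt_en_de" architecture resolves
# to the "transformer" model and is therefore supported.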

_SUPPORTED_ACTIVATIONS = {
    "gelu": common_spec.Activation.GELU,
    "gelu_accurate": common_spec.Activation.GELUTanh,
    "gelu_fast": common_spec.Activation.GELUTanh,
    "relu": common_spec.Activation.RELU,
    "swish": common_spec.Activation.SWISH,
}
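
# fairseq's "gelu_accurate" and "gelu_fast" both compute the tanh
# approximation of GELU, which is why both map to GELUTanh above.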


def _get_model_spec(args):
    import fairseq

    activation_fn = getattr(args, "activation_fn", "relu")
    model_name = fairseq.models.ARCH_MODEL_NAME_REGISTRY[args.arch]

    check = utils.ConfigurationChecker()
    check(
        model_name in _SUPPORTED_MODELS,
        "Model '%s' used by architecture '%s' is not supported (supported models are: %s)"
        % (model_name, args.arch, ", ".join(_SUPPORTED_MODELS)),
    )
    # Fail fast on unsupported models before reading model-specific options.
    check.validate()
    check(
        activation_fn in _SUPPORTED_ACTIVATIONS,
        "Option --activation-fn %s is not supported (supported activations are: %s)"
        % (activation_fn, ", ".join(_SUPPORTED_ACTIVATIONS.keys())),
    )
    check(
        not getattr(args, "no_token_positional_embeddings", False),
        "Option --no-token-positional-embeddings is not supported",
    )
    check(
        not getattr(args, "lang_tok_replacing_bos_eos", False),
        "Option --lang-tok-replacing-bos-eos is not supported",
    )

    if model_name == "transformer_lm":
        # Decoder-only language model.
        check(
            not args.character_embeddings,
            "Option --character-embeddings is not supported",
        )
        check(
            not args.adaptive_input,
            "Option --adaptive-input is not supported",
        )
        check.validate()

        return transformer_spec.TransformerDecoderModelSpec.from_config(
            args.decoder_layers,
            args.decoder_attention_heads,
            pre_norm=args.decoder_normalize_before,
            activation=_SUPPORTED_ACTIVATIONS[activation_fn],
            layernorm_embedding=getattr(args, "layernorm_embedding", False),
            no_final_norm=args.no_decoder_final_norm,
            project_in_out=args.decoder_input_dim != args.decoder_embed_dim,
        )

    else:
        # Encoder-decoder model.
        check(
            args.encoder_normalize_before == args.decoder_normalize_before,
            "Options --encoder-normalize-before and --decoder-normalize-before "
            "must have the same value",
        )
        check(
            args.encoder_attention_heads == args.decoder_attention_heads,
            "Options --encoder-attention-heads and --decoder-attention-heads "
            "must have the same value",
        )
        check.validate()

        return transformer_spec.TransformerSpec.from_config(
            (args.encoder_layers, args.decoder_layers),
            args.encoder_attention_heads,
            pre_norm=args.encoder_normalize_before,
            activation=_SUPPORTED_ACTIVATIONS[activation_fn],
            alignment_layer=getattr(args, "alignment_layer", -1),
            alignment_heads=getattr(args, "alignment_heads", 0),
            layernorm_embedding=getattr(args, "layernorm_embedding", False),
        )
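

# CTranslate2 uses "<blank>" as its padding token, so fairseq's "<pad>" is
# renamed when the vocabulary is exported.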
def _get_vocab(dictionary):
    return ["<blank>" if token == "<pad>" else token for token in dictionary.symbols]


class FairseqConverter(Converter):
    """Converts models trained with Fairseq."""

    def __init__(
        self,
        model_path: str,
        data_dir: str,
        source_lang: Optional[str] = None,
        target_lang: Optional[str] = None,
        fixed_dictionary: Optional[str] = None,
        no_default_special_tokens: bool = False,
        user_dir: Optional[str] = None,
    ):
        """Initializes the Fairseq converter.

        Arguments:
          model_path: Path to the Fairseq PyTorch model (.pt file).
          data_dir: Path to the Fairseq data directory containing vocabulary files.
          source_lang: Source language (may be required if not declared in the model).
          target_lang: Target language (may be required if not declared in the model).
          fixed_dictionary: Path to the fixed dictionary for multilingual models.
          no_default_special_tokens: Require all special tokens to be provided by the user
            (e.g. encoder end token, decoder start token).
          user_dir: Path to the user directory containing custom extensions.
        """
        self._model_path = model_path
        self._data_dir = data_dir
        self._fixed_dictionary = fixed_dictionary
        self._source_lang = source_lang
        self._target_lang = target_lang
        self._no_default_special_tokens = no_default_special_tokens
        self._user_dir = user_dir

    def _load(self):
        import fairseq
        import torch

        from fairseq import checkpoint_utils

        if self._user_dir:
            from fairseq.utils import import_user_module

            import_user_module(argparse.Namespace(user_dir=self._user_dir))

        with torch.no_grad():
            checkpoint = checkpoint_utils.load_checkpoint_to_cpu(self._model_path)
            args = checkpoint["args"] or checkpoint["cfg"]["model"]

            args.data = self._data_dir
            if self._fixed_dictionary is not None:
                args.fixed_dictionary = self._fixed_dictionary
            if hasattr(args, "lang_dict") and args.lang_dict:
                args.lang_dict = os.path.join(
                    self._data_dir, os.path.basename(args.lang_dict)
                )

            if self._source_lang is not None:
                args.source_lang = self._source_lang

            if self._target_lang is not None:
                args.target_lang = self._target_lang

            spec = _get_model_spec(args)

            task = fairseq.tasks.setup_task(args)
            model = fairseq.models.build_model(args, task)
            model.eval()
            model.load_state_dict(checkpoint["model"])

            if isinstance(spec, transformer_spec.TransformerDecoderModelSpec):
                set_transformer_decoder(
                    spec.decoder,
                    model.decoder,
                    with_encoder_attention=False,
                )

                spec.register_vocabulary(_get_vocab(task.dictionary))
                if not args.add_bos_token:
                    spec.config.bos_token = spec.config.eos_token

            else:
                set_transformer_encoder(spec.encoder, model.encoder)
                set_transformer_decoder(spec.decoder, model.decoder)

                spec.register_source_vocabulary(_get_vocab(task.source_dictionary))
                spec.register_target_vocabulary(_get_vocab(task.target_dictionary))
                if self._no_default_special_tokens:
                    spec.config.decoder_start_token = None
                else:
                    spec.config.decoder_start_token = spec.config.eos_token
                    spec.config.add_source_eos = True

            return spec
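

# A minimal usage sketch (paths and the quantization option are illustrative):
#
#     converter = FairseqConverter("checkpoint.pt", "data-bin/")
#     converter.convert("ct2_output", quantization="int8")
#
# convert() and convert_from_args() come from the Converter base class; the
# helpers below copy the weights of each fairseq module into the matching
# CTranslate2 spec attribute, layer by layer.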
def set_transformer_encoder(spec, module):
    set_input_layers(spec, module)
    for layer_spec, layer in zip(spec.layer, module.layers):
        set_transformer_encoder_layer(layer_spec, layer)
    if module.layer_norm is not None:
        set_layer_norm(spec.layer_norm, module.layer_norm)
    if module.layernorm_embedding is not None:
        set_layer_norm(spec.layernorm_embedding, module.layernorm_embedding)


def set_transformer_decoder(spec, module, with_encoder_attention=True):
    set_input_layers(spec, module)
    set_linear(spec.projection, module.output_projection)
    for layer_spec, layer in zip(spec.layer, module.layers):
        set_transformer_decoder_layer(
            layer_spec,
            layer,
            with_encoder_attention=with_encoder_attention,
        )
    if module.layer_norm is not None:
        set_layer_norm(spec.layer_norm, module.layer_norm)
    if module.layernorm_embedding is not None:
        set_layer_norm(spec.layernorm_embedding, module.layernorm_embedding)
    if module.project_in_dim is not None:
        set_linear(spec.project_in, module.project_in_dim)
    if module.project_out_dim is not None:
        set_linear(spec.project_out, module.project_out_dim)


def set_input_layers(spec, module):
    set_position_encodings(spec.position_encodings, module.embed_positions)
    set_embeddings(
        spec.embeddings[0] if isinstance(spec.embeddings, list) else spec.embeddings,
        module.embed_tokens,
    )
    spec.scale_embeddings = module.embed_scale


def set_transformer_encoder_layer(spec, module):
    set_ffn(spec.ffn, module)
    set_multi_head_attention(spec.self_attention, module.self_attn, self_attention=True)
    set_layer_norm(spec.self_attention.layer_norm, module.self_attn_layer_norm)


def set_transformer_decoder_layer(spec, module, with_encoder_attention=True):
    set_ffn(spec.ffn, module)
    set_multi_head_attention(spec.self_attention, module.self_attn, self_attention=True)
    set_layer_norm(spec.self_attention.layer_norm, module.self_attn_layer_norm)
    if with_encoder_attention:
        set_multi_head_attention(spec.attention, module.encoder_attn)
        set_layer_norm(spec.attention.layer_norm, module.encoder_attn_layer_norm)


def set_ffn(spec, module):
    set_layer_norm(spec.layer_norm, module.final_layer_norm)
    set_linear(spec.linear_0, module.fc1)
    set_linear(spec.linear_1, module.fc2)
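

# CTranslate2 stores the Q/K/V projections of self-attention as one fused
# linear layer (and K/V fused for cross-attention), so fairseq's separate
# projections are concatenated with utils.fuse_linear.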
def set_multi_head_attention(spec, module, self_attention=False):
    if self_attention:
        split_layers = [common_spec.LinearSpec() for _ in range(3)]
        set_linear(split_layers[0], module.q_proj)
        set_linear(split_layers[1], module.k_proj)
        set_linear(split_layers[2], module.v_proj)
        utils.fuse_linear(spec.linear[0], split_layers)
    else:
        set_linear(spec.linear[0], module.q_proj)
        split_layers = [common_spec.LinearSpec() for _ in range(2)]
        set_linear(split_layers[0], module.k_proj)
        set_linear(split_layers[1], module.v_proj)
        utils.fuse_linear(spec.linear[1], split_layers)
    set_linear(spec.linear[-1], module.out_proj)


def set_layer_norm(spec, module):
    spec.gamma = module.weight.numpy()
    spec.beta = module.bias.numpy()


def set_linear(spec, module):
    spec.weight = module.weight.numpy()
    if module.bias is not None:
        spec.bias = module.bias.numpy()


def set_embeddings(spec, module):
    spec.weight = module.weight.numpy()
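

# fairseq position tables reserve the first padding_idx + 1 rows (real
# positions start at padding_idx + 1), so those rows are skipped on export.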
def set_position_encodings(spec, module):
    import torch

    weight = module.weight if isinstance(module, torch.nn.Embedding) else module.weights
    spec.encodings = weight.numpy()[module.padding_idx + 1 :]


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--model_path", required=True, help="Model path.")
    parser.add_argument(
        "--data_dir",
        required=True,
        help="Data directory containing the source and target vocabularies.",
    )
    parser.add_argument(
        "--user_dir",
        help="Directory containing custom extensions.",
    )
    parser.add_argument(
        "--fixed_dictionary",
        help="Fixed dictionary for multilingual models.",
    )
    parser.add_argument(
        "--source_lang",
        help="Source language. This argument is used to find the dictionary file in `data_dir`.",
    )
    parser.add_argument(
        "--target_lang",
        help="Target language. This argument is used to find the dictionary file in `data_dir`.",
    )
    parser.add_argument(
        "--no_default_special_tokens",
        action="store_true",
        help=(
            "Require all special tokens to be provided by the user during inference, "
            "including the decoder start token."
        ),
    )
    Converter.declare_arguments(parser)
    args = parser.parse_args()
    converter = FairseqConverter(
        args.model_path,
        args.data_dir,
        source_lang=args.source_lang,
        target_lang=args.target_lang,
        fixed_dictionary=args.fixed_dictionary,
        no_default_special_tokens=args.no_default_special_tokens,
        user_dir=args.user_dir,
    )
    converter.convert_from_args(args)
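

# Example invocation (a sketch; paths and languages are illustrative, and
# --output_dir is one of the options added by Converter.declare_arguments):
#
#     python -m ctranslate2.converters.fairseq --model_path checkpoint.pt \
#         --data_dir data-bin/ --output_dir ct2_output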
if __name__ == "__main__":
    main()