Skip to content

gpt_bigcode

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode

MindNLP gpt_bigcode model

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeAttention

Bases: Module

GPT BigCode Attention

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
class GPTBigCodeAttention(nn.Module):
    """Attention layer supporting multi-head (MHA) and multi-query (MQA) attention.

    With ``config.multi_query`` enabled, a single key/value head is shared by all
    query heads (``kv_heads == 1``); otherwise every head has its own key/value
    projection (``kv_heads == num_heads``).
    """

    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        """
        Build the projection and dropout layers and read the attention options from ``config``.

        Args:
            config: Configuration object. The attributes read here are ``multi_query``, ``hidden_size``,
                ``num_attention_heads``, ``scale_attn_weights``, ``attention_softmax_in_fp32``,
                ``scale_attention_softmax_in_fp32``, ``attn_pdrop`` and ``resid_pdrop``.
            is_cross_attention (bool): When True, a separate query projection ``q_attn`` is created and
                ``c_attn`` only produces keys and values. Defaults to False.
            layer_idx (int, optional): Index of the layer. It is used by ``_attn`` to compute the softmax
                un-scaling factor ``layer_idx + 1``. Defaults to None.

        Raises:
            ValueError: If ``config.hidden_size`` is not divisible by ``config.num_attention_heads``.
            NotImplementedError: If ``is_cross_attention`` is True while ``config.multi_query`` is enabled.
        """
        super().__init__()
        # Cache for the scalar mask tensor built on demand by `_get_mask_value`.
        self.mask_value = None

        self.multi_query = config.multi_query
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        # MQA shares one key/value head across all query heads.
        self.kv_heads = 1 if self.multi_query else self.num_heads
        self.kv_dim = self.kv_heads * self.head_dim
        self.split_size = self.embed_dim
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )

        self.scale_attn_weights = config.scale_attn_weights
        self.is_cross_attention = is_cross_attention

        self.layer_idx = layer_idx
        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        # Softmax un-scaling is only meaningful when the softmax itself runs in fp32.
        self.scale_attention_softmax_in_fp32 = (
            config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
        )

        if self.is_cross_attention:
            if self.multi_query:
                raise NotImplementedError(
                    "Multi-Query Attention not supported for cross_attention")

            # Keys/values come from the encoder, queries from the decoder hidden states.
            self.c_attn = nn.Linear(self.embed_dim, 2 * self.embed_dim)
            self.q_attn = nn.Linear(self.embed_dim, self.embed_dim)
        else:
            # One fused projection producing query (embed_dim) plus key and value (kv_dim each).
            self.c_attn = nn.Linear(
                self.embed_dim, self.embed_dim + 2 * self.kv_dim)

        self.c_proj = nn.Linear(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(p=config.attn_pdrop)
        self.resid_dropout = nn.Dropout(p=config.resid_pdrop)

    def _get_mask_value(self, dtype):
        """
        Return a scalar tensor holding the most negative finite value of ``dtype``.

        The tensor is cached on ``self.mask_value`` and rebuilt only when no value is cached yet
        or the cached tensor has a different dtype.

        Args:
            dtype: MindSpore dtype of the attention scores (it is passed to
                ``mindspore.dtype_to_nptype``).

        Returns:
            Tensor: Scalar (shape ``[]``) tensor of type ``dtype`` used to fill masked positions.
        """
        # `_attn` passes the mask value to `ops.where` as a tensor operand, so it is
        # materialized as a tensor once and reused across calls.
        if self.mask_value is None or self.mask_value.dtype != dtype:
            tmp_value = np.finfo(mindspore.dtype_to_nptype(dtype)).min
            self.mask_value = ops.full([], Tensor(
                tmp_value, dtype=dtype), dtype=dtype)
        return self.mask_value

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        """
        Compute scaled attention weights and apply them to ``value``.

        Args:
            query (Tensor): ``(batch_size, query_length, num_heads * head_dim)`` for MQA,
                ``(batch_size, num_heads, query_length, head_dim)`` for MHA.
            key (Tensor): Key tensor that is already transposed so that its last axis is the key length.
                For MHA it is reshaped to ``(batch_size * num_heads, head_dim, key_length)``; for MQA it is
                used as is in the batched matmul, i.e. ``(batch_size, head_dim, key_length)``.
            value (Tensor): Value tensor multiplied by the attention weights. For MQA it goes through
                ``ops.bmm`` with weights of shape ``(batch_size, query_length * num_heads, key_length)``;
                for MHA through ``ops.matmul`` with weights of shape
                ``(batch_size, num_heads, query_length, key_length)``.
            attention_mask (Tensor, optional): Mask applied before the softmax. In the non-upcast path,
                positions where the mask is truthy keep their score and all others are replaced by the
                mask value. Defaults to None.
            head_mask (Tensor, optional): Multiplied element-wise into the attention weights after dropout.
                For MQA its axes 1 and 2 are swapped first. Defaults to None.

        Returns:
            Tuple[Tensor, Tensor]: ``(attn_output, attn_weights)``. ``attn_weights`` has shape
            ``(batch_size, query_length, num_heads, key_length)`` for MQA and
            ``(batch_size, num_heads, query_length, key_length)`` for MHA. For MQA, ``attn_output`` is
            viewed back to the shape of the incoming ``query``.

        Raises:
            TypeError: If softmax un-scaling is active (``scale_attention_softmax_in_fp32`` and an upcast
                to fp32 is needed) while ``layer_idx`` is None, because ``layer_idx + 1`` is evaluated.
        """
        dtype = query.dtype
        softmax_dtype = mindspore.float32 if self.attention_softmax_in_fp32 else dtype
        upcast = dtype != softmax_dtype

        # When up-casting, the scores are divided by (layer_idx + 1) here and the softmax
        # helpers receive `unscale` so they can undo that scaling.
        unscale = self.layer_idx + 1 if self.scale_attention_softmax_in_fp32 and upcast else 1
        scale_factor = unscale**-1
        if self.scale_attn_weights:
            scale_factor /= self.head_dim**0.5

        # MQA models: (batch_size, query_length, num_heads * head_dim)
        # MHA models: (batch_size, num_heads, query_length, head_dim)
        query_shape = query.shape
        batch_size = query_shape[0]
        key_length = key.shape[-1]
        if self.multi_query:
            # (batch_size, query_length, num_heads, head_dim) x (batch_size, head_dim, key_length)
            # -> (batch_size, query_length, num_heads, key_length)
            query_length = query_shape[1]
            attn_shape = (batch_size, query_length, self.num_heads, key_length)
            attn_view = (batch_size, query_length * self.num_heads, key_length)
            # Fold the heads into the query-length axis so one batched matmul covers all heads.
            query = query.reshape(
                batch_size, query_length * self.num_heads, self.head_dim)
        else:
            # (batch_size, num_heads, query_length, head_dim) x (batch_size, num_heads, head_dim, key_length)
            # -> (batch_size, num_heads, query_length, key_length)
            query_length = query_shape[2]
            attn_shape = (batch_size, self.num_heads, query_length, key_length)
            attn_view = (batch_size * self.num_heads, query_length, key_length)
            # Fold the heads into the batch axis for the batched matmul.
            query = query.reshape(
                batch_size * self.num_heads, query_length, self.head_dim)
            key = key.reshape(batch_size * self.num_heads,
                              self.head_dim, key_length)

        attn_weights = mindspore.numpy.empty(
            attn_view, dtype=query.dtype)

        # The accumulator is zeroed, so with beta = 1 the result is alpha * (query @ key).
        attn_weights = ops.zeros_like(attn_weights)
        beta = 1
        attn_weights = Tensor.baddbmm(
            attn_weights, query, key, beta=beta, alpha=scale_factor).view(attn_shape)

        if upcast:
            # Softmax runs in `softmax_dtype` through the module-level up-casting helpers.
            if attention_mask is None:
                attn_weights = upcast_softmax(
                    attn_weights, unscale, softmax_dtype)
            else:
                mask_value = self._get_mask_value(softmax_dtype)
                attn_weights = upcast_masked_softmax(
                    attn_weights, attention_mask, mask_value, unscale, softmax_dtype)
        else:
            if attention_mask is not None:
                mask_value = self._get_mask_value(softmax_dtype)

                # The fused kernel is very slow when the key length is not a multiple of 8, so we skip fusion.
                attn_weights = ops.where(
                    Tensor(attention_mask, dtype=mindspore.bool_), attn_weights, mask_value)

            attn_weights = ops.softmax(attn_weights, axis=-1)

        attn_weights = self.attn_dropout(attn_weights)

        # Mask heads if we want to
        if head_mask is not None:
            if self.multi_query:
                head_mask = head_mask.swapaxes(1, 2)
            attn_weights = attn_weights * head_mask

        if self.multi_query:
            attn_output = ops.bmm(attn_weights.view(
                attn_view), value).view(query_shape)
        else:
            attn_output = ops.matmul(attn_weights, value)

        return attn_output, attn_weights

    def forward(
        self,
        hidden_states: mindspore.Tensor,
        layer_past: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        encoder_hidden_states: Optional[mindspore.Tensor] = None,
        encoder_attention_mask: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[
        Tuple[mindspore.Tensor, Optional[mindspore.Tensor]],
        Tuple[mindspore.Tensor, Optional[mindspore.Tensor],
              Tuple[mindspore.Tensor, ...]],
    ]:
        """
        Project ``hidden_states`` to query/key/value, run attention and apply the output projection.

        Args:
            hidden_states (mindspore.Tensor): Input to the attention layer.
            layer_past (Optional[mindspore.Tensor]): Cached key/value tensor from earlier steps. When given,
                it is concatenated with the newly computed key/value tensor along axis ``-2``.
                Default is None.
            attention_mask (Optional[mindspore.Tensor]): Mask forwarded to ``_attn``. It is ignored when
                ``encoder_hidden_states`` is given (the encoder mask is used instead). Default is None.
            head_mask (Optional[mindspore.Tensor]): Per-head mask forwarded to ``_attn``. Default is None.
            encoder_hidden_states (Optional[mindspore.Tensor]): When given, the layer runs as
                cross-attention: queries come from ``hidden_states`` and keys/values from this tensor.
                Default is None.
            encoder_attention_mask (Optional[mindspore.Tensor]): Mask for the encoder positions; converted
                to bool when provided. Default is None.
            use_cache (Optional[bool]): Whether to return the key/value tensor as ``present``.
                Default is False.
            output_attentions (Optional[bool]): Whether to append the attention weights to the output.
                Default is False.

        Returns:
            tuple: ``(attn_output, present)``, followed by ``attn_weights`` when ``output_attentions`` is
            True. ``present`` is the (possibly extended) key/value tensor when ``use_cache`` is True,
            otherwise None. For MQA the returned weights are transposed to
            ``(batch_size, num_heads, query_length, key_length)``.

        Raises:
            ValueError: If ``encoder_hidden_states`` is given but the layer has no ``q_attn`` projection or
                was not created with ``is_cross_attention=True``.
        """
        if encoder_hidden_states is not None:
            if not hasattr(self, "q_attn") or not self.is_cross_attention:
                raise ValueError(
                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
                    "Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`."
                )

            # NOTE(review): on this path query/key_value are not split into heads before `_attn`,
            # unlike the self-attention branches below — confirm the cross-attention path is exercised.
            query = self.q_attn(hidden_states)
            key_value = self.c_attn(encoder_hidden_states)
            # The encoder mask replaces the self-attention mask. It is optional (defaults to
            # None), so it is only converted to bool when it was actually provided.
            attention_mask = (
                encoder_attention_mask.bool() if encoder_attention_mask is not None else None
            )
        elif self.multi_query:
            query, key_value = self.c_attn(hidden_states).split(
                (self.embed_dim, 2 * self.kv_dim), axis=2)
        else:
            # Note: We split as (self.num_heads, 3, self.head_dim) instead of (3, self.num_heads, self.head_dim),
            # i.e., the memory layout is not the same as GPT2.
            # This makes the concatenation with past_key_value more efficient.
            query, key_value = (
                self.c_attn(hidden_states)
                .view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim)
                .swapaxes(1, 2)
                .split((self.head_dim, 2 * self.head_dim), axis=3)
            )

        if layer_past is not None:
            key_value = ops.cat((layer_past, key_value), axis=-2)
        present = key_value if use_cache else None

        key, value = key_value.split((self.head_dim, self.head_dim), axis=-1)

        # `_attn` expects the key with its last axis being the key length.
        attn_output, attn_weights = self._attn(
            query, key.swapaxes(-1, -2), value, attention_mask, head_mask)

        if not self.multi_query:
            # Move the head axis back next to head_dim and merge the two.
            attn_output = attn_output.swapaxes(
                1, 2).reshape(hidden_states.shape)

        attn_output = self.c_proj(attn_output)
        attn_output = self.resid_dropout(attn_output)

        outputs = (attn_output, present)
        if output_attentions:
            if self.multi_query:
                # Transpose to return weights in the usual format (batch_size, num_heads, query_length, key_length)
                attn_weights = attn_weights.swapaxes(1, 2)
            outputs += (attn_weights,)

        return outputs  # a, present, (attentions)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeAttention.__init__(config, is_cross_attention=False, layer_idx=None)

Initializes the GPTBigCodeAttention class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

An object containing configuration parameters. Must have attributes: multi_query (bool), hidden_size (int), num_attention_heads (int), scale_attn_weights (bool), attention_softmax_in_fp32 (bool), scale_attention_softmax_in_fp32 (bool), attn_pdrop (float), resid_pdrop (float).

is_cross_attention

A boolean indicating whether cross-attention is enabled.

DEFAULT: False

layer_idx

An integer representing the layer index.

DEFAULT: None

RETURNS DESCRIPTION

None

RAISES DESCRIPTION
ValueError

If embed_dim is not divisible by num_heads.

NotImplementedError

If cross-attention is requested while multi-query attention is enabled; this combination is not supported.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def __init__(self, config, is_cross_attention=False, layer_idx=None):
    """
    Set up the projections, dropout layers and attention options from ``config``.

    Args:
        self: The instance being initialized.
        config: Configuration object. The attributes read here are ``multi_query``, ``hidden_size``,
            ``num_attention_heads``, ``scale_attn_weights``, ``attention_softmax_in_fp32``,
            ``scale_attention_softmax_in_fp32``, ``attn_pdrop`` and ``resid_pdrop``.
        is_cross_attention: Whether the layer is used for cross-attention. Defaults to False.
        layer_idx: Index of the layer, stored on the instance. Defaults to None.

    Returns:
        None

    Raises:
        ValueError: If ``config.hidden_size`` is not divisible by ``config.num_attention_heads``.
        NotImplementedError: If ``is_cross_attention`` is set while ``config.multi_query`` is enabled.
    """
    super().__init__()

    # Read the sizes once into locals; they are mirrored onto the instance below.
    use_mqa = config.multi_query
    width = config.hidden_size
    n_heads = config.num_attention_heads
    per_head = width // n_heads
    if per_head * n_heads != width:
        raise ValueError(
            f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {width} and `num_heads`:"
            f" {n_heads})."
        )

    # Multi-query attention keeps a single key/value head for all query heads.
    if use_mqa:
        kv_head_count = 1
    else:
        kv_head_count = n_heads

    # No mask-value tensor has been built yet.
    self.mask_value = None
    self.multi_query = use_mqa
    self.embed_dim = width
    self.num_heads = n_heads
    self.head_dim = per_head
    self.kv_heads = kv_head_count
    self.kv_dim = kv_head_count * per_head
    self.split_size = width

    self.scale_attn_weights = config.scale_attn_weights
    self.is_cross_attention = is_cross_attention
    self.layer_idx = layer_idx
    self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
    self.scale_attention_softmax_in_fp32 = (
        config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
    )

    if is_cross_attention and use_mqa:
        raise NotImplementedError(
            "Multi-Query Attention not supported for cross_attention")

    # Cross-attention: `c_attn` yields keys and values only and a separate `q_attn`
    # yields the queries. Self-attention: `c_attn` yields query, key and value fused.
    if is_cross_attention:
        fused_width = 2 * width
    else:
        fused_width = width + 2 * self.kv_dim
    self.c_attn = nn.Linear(width, fused_width)
    if is_cross_attention:
        self.q_attn = nn.Linear(width, width)

    self.c_proj = nn.Linear(width, width)

    self.attn_dropout = nn.Dropout(p=config.attn_pdrop)
    self.resid_dropout = nn.Dropout(p=config.resid_pdrop)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeAttention.forward(hidden_states, layer_past=None, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, use_cache=False, output_attentions=False)

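Compute the attention output for `hidden_states`: project them to query/key/value, optionally extend the cached key/value tensor, run attention, and apply the output projection.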

PARAMETER DESCRIPTION
self

The object instance.

hidden_states

The input hidden states to the attention mechanism.

TYPE: Tensor

layer_past

Cached key/value tensor from earlier steps; when given, it is concatenated with the newly computed key/value tensor along axis -2. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

attention_mask

Mask to prevent attention to certain positions. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

head_mask

Mask for individual attention heads. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

encoder_hidden_states

Hidden states from encoder if cross-attention is used. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

encoder_attention_mask

Mask for encoder attention. Default is None.

TYPE: Optional[Tensor] DEFAULT: None

use_cache

Whether to cache the key-value pair for future calls. Default is False.

TYPE: Optional[bool] DEFAULT: False

output_attentions

Whether to output the attention weights. Default is False.

TYPE: Optional[bool] DEFAULT: False

RETURNS DESCRIPTION
Union[Tuple[Tensor, Optional[Tensor]], Tuple[Tensor, Optional[Tensor], Tuple[Tensor, ...]]]

A tuple `(attn_output, present)`, followed by the attention weights when `output_attentions` is True. `present` is the key/value tensor when `use_cache` is True, otherwise None.

RAISES DESCRIPTION
ValueError

If 'q_attn' weights are not defined for cross-attention or if class is not instantiated with 'is_cross_attention=True'.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
def forward(
    self,
    hidden_states: mindspore.Tensor,
    layer_past: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    encoder_hidden_states: Optional[mindspore.Tensor] = None,
    encoder_attention_mask: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = False,
    output_attentions: Optional[bool] = False,
) -> Union[
    Tuple[mindspore.Tensor, Optional[mindspore.Tensor]],
    Tuple[mindspore.Tensor, Optional[mindspore.Tensor],
          Tuple[mindspore.Tensor, ...]],
]:
    """
    Project ``hidden_states`` to query/key/value, run attention and apply the output projection.

    Args:
        self: The object instance.
        hidden_states (mindspore.Tensor): Input to the attention layer.
        layer_past (Optional[mindspore.Tensor]): Cached key/value tensor from earlier steps. When given,
            it is concatenated with the newly computed key/value tensor along axis ``-2``.
            Default is None.
        attention_mask (Optional[mindspore.Tensor]): Mask forwarded to ``_attn``. It is ignored when
            ``encoder_hidden_states`` is given (the encoder mask is used instead). Default is None.
        head_mask (Optional[mindspore.Tensor]): Per-head mask forwarded to ``_attn``. Default is None.
        encoder_hidden_states (Optional[mindspore.Tensor]): When given, the layer runs as
            cross-attention: queries come from ``hidden_states`` and keys/values from this tensor.
            Default is None.
        encoder_attention_mask (Optional[mindspore.Tensor]): Mask for the encoder positions; converted
            to bool when provided. Default is None.
        use_cache (Optional[bool]): Whether to return the key/value tensor as ``present``.
            Default is False.
        output_attentions (Optional[bool]): Whether to append the attention weights to the output.
            Default is False.

    Returns:
        tuple: ``(attn_output, present)``, followed by ``attn_weights`` when ``output_attentions`` is
        True. ``present`` is the (possibly extended) key/value tensor when ``use_cache`` is True,
        otherwise None.

    Raises:
        ValueError: If ``encoder_hidden_states`` is given but the layer has no ``q_attn`` projection or
            was not created with ``is_cross_attention=True``.
    """
    if encoder_hidden_states is not None:
        if not hasattr(self, "q_attn") or not self.is_cross_attention:
            raise ValueError(
                "If class is used as cross attention, the weights `q_attn` have to be defined. "
                "Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`."
            )

        query = self.q_attn(hidden_states)
        key_value = self.c_attn(encoder_hidden_states)
        # The encoder mask replaces the self-attention mask. It is optional (defaults to
        # None), so it is only converted to bool when it was actually provided.
        attention_mask = (
            encoder_attention_mask.bool() if encoder_attention_mask is not None else None
        )
    elif self.multi_query:
        query, key_value = self.c_attn(hidden_states).split(
            (self.embed_dim, 2 * self.kv_dim), axis=2)
    else:
        # Note: We split as (self.num_heads, 3, self.head_dim) instead of (3, self.num_heads, self.head_dim),
        # i.e., the memory layout is not the same as GPT2.
        # This makes the concatenation with past_key_value more efficient.
        query, key_value = (
            self.c_attn(hidden_states)
            .view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim)
            .swapaxes(1, 2)
            .split((self.head_dim, 2 * self.head_dim), axis=3)
        )

    if layer_past is not None:
        key_value = ops.cat((layer_past, key_value), axis=-2)
    present = key_value if use_cache else None

    key, value = key_value.split((self.head_dim, self.head_dim), axis=-1)

    # The key is handed over transposed, with the key length as its last axis.
    attn_output, attn_weights = self._attn(
        query, key.swapaxes(-1, -2), value, attention_mask, head_mask)

    if not self.multi_query:
        # Move the head axis back next to head_dim and merge the two.
        attn_output = attn_output.swapaxes(
            1, 2).reshape(hidden_states.shape)

    attn_output = self.c_proj(attn_output)
    attn_output = self.resid_dropout(attn_output)

    outputs = (attn_output, present)
    if output_attentions:
        if self.multi_query:
            # Transpose to return weights in the usual format (batch_size, num_heads, query_length, key_length)
            attn_weights = attn_weights.swapaxes(1, 2)
        outputs += (attn_weights,)

    return outputs  # a, present, (attentions)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeBlock

Bases: Module

GPT BigCode Block

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
class GPTBigCodeBlock(nn.Module):
    """One GPTBigCode transformer block.

    The block applies layer norm + self-attention, optionally layer norm + cross-attention,
    and layer norm + MLP, each followed by a residual connection.
    """

    def __init__(self, config, layer_idx=None):
        """
        Create the layer norms, attention layer(s) and MLP of the block.

        Args:
            config (object): Configuration object. The attributes read here are ``hidden_size``,
                ``n_inner``, ``layer_norm_epsilon``, ``add_cross_attention`` and ``multi_query``;
                the object is also passed on to the attention and MLP sub-modules.
            layer_idx (int, optional): Index of the layer, forwarded to the attention layers.
                Defaults to None.

        Raises:
            NotImplementedError: If ``config.add_cross_attention`` is set while ``config.multi_query``
                is enabled.
        """
        super().__init__()
        hidden_size = config.hidden_size
        # The MLP width falls back to 4 * hidden_size when the config does not set `n_inner`.
        self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

        self.ln_1 = nn.LayerNorm(
            [hidden_size], eps=config.layer_norm_epsilon)
        self.attn = GPTBigCodeAttention(config, layer_idx=layer_idx)
        self.ln_2 = nn.LayerNorm(
            [hidden_size], eps=config.layer_norm_epsilon)

        if config.add_cross_attention:
            if config.multi_query:
                raise NotImplementedError(
                    "Cross-attention not implemented for MQA")
            self.crossattention = GPTBigCodeAttention(
                config, is_cross_attention=True, layer_idx=layer_idx)
            # Same normalized-shape form as ln_1 / ln_2, for consistency.
            self.ln_cross_attn = nn.LayerNorm(
                [hidden_size], eps=config.layer_norm_epsilon)

        self.mlp = GPTBigCodeMLP(self.inner_dim, config)

    def forward(
        self,
        hidden_states: Optional[Tuple[mindspore.Tensor]],
        layer_past: Optional[mindspore.Tensor] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        encoder_hidden_states: Optional[mindspore.Tensor] = None,
        encoder_attention_mask: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ) -> Union[
        Tuple[mindspore.Tensor], Tuple[mindspore.Tensor,
                                       mindspore.Tensor], Tuple[mindspore.Tensor, mindspore.Tensor, mindspore.Tensor]
    ]:
        """
        Run self-attention, optional cross-attention and the MLP with residual connections.

        Args:
            hidden_states: Input of the block. Although annotated as an optional tuple, the value is
                passed directly to ``ln_1`` and added to the attention output, i.e. it is used as a
                single tensor.
            layer_past (Optional[mindspore.Tensor]): Cached key/value tensor forwarded to the
                self-attention layer.
            attention_mask (Optional[mindspore.Tensor]): Attention mask forwarded to both attention layers.
            head_mask (Optional[mindspore.Tensor]): Head mask forwarded to both attention layers.
            encoder_hidden_states (Optional[mindspore.Tensor]): When given, the cross-attention layer
                is applied after self-attention.
            encoder_attention_mask (Optional[mindspore.Tensor]): Mask forwarded to the cross-attention layer.
            use_cache (Optional[bool]): Whether to keep the self-attention ``present`` entry in the output.
            output_attentions (Optional[bool]): Whether the attention layers return their weights.

        Returns:
            tuple: ``(hidden_states, present, ...)`` when ``use_cache`` is True, otherwise
            ``(hidden_states, ...)``. The trailing entries are the self-attention weights and, when
            cross-attention ran, the cross-attention weights; both are present only when
            ``output_attentions`` is True.

        Raises:
            ValueError: If ``encoder_hidden_states`` is passed but the block has no ``crossattention``
                layer (it is only created when ``config.add_cross_attention`` is set).
        """
        residual = hidden_states
        hidden_states = self.ln_1(hidden_states)
        attn_outputs = self.attn(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
        outputs = attn_outputs[1:]
        # residual connection
        hidden_states = attn_output + residual

        if encoder_hidden_states is not None:
            # add one self-attention block for cross-attention
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                    "cross-attention layers by setting `config.add_cross_attention=True`"
                )
            residual = hidden_states
            hidden_states = self.ln_cross_attn(hidden_states)
            cross_attn_outputs = self.crossattention(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attn_output = cross_attn_outputs[0]
            # residual connection
            hidden_states = residual + attn_output
            # add cross attentions if we output attention weights
            # (index 1 is the cross-attention `present`, which is not propagated)
            outputs = outputs + cross_attn_outputs[2:]

        residual = hidden_states
        hidden_states = self.ln_2(hidden_states)
        feed_forward_hidden_states = self.mlp(hidden_states)
        # residual connection
        hidden_states = residual + feed_forward_hidden_states

        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            # Drop the leading `present` entry (None when caching is off).
            outputs = (hidden_states,) + outputs[1:]

        # hidden_states, present, (attentions, cross_attentions)
        return outputs

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeBlock.__init__(config, layer_idx=None)

Initializes an instance of the GPTBigCodeBlock class.

PARAMETER DESCRIPTION
self

The object instance.

config

An object containing configuration settings for the GPTBigCodeBlock.

TYPE: object

layer_idx

The index of the layer. Defaults to None.

TYPE: int DEFAULT: None

RETURNS DESCRIPTION

None

RAISES DESCRIPTION
NotImplementedError

If cross-attention is enabled with multi-query architecture (MQA).

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
def __init__(self, config, layer_idx=None):
    """
    Build the sub-layers of one GPTBigCode transformer block.

    The block always owns a LayerNorm applied before self-attention (`ln_1`), the self-attention
    layer (`attn`), a LayerNorm applied before the MLP (`ln_2`) and the MLP itself (`mlp`). When
    `config.add_cross_attention` is set it additionally owns a cross-attention layer
    (`crossattention`) and the LayerNorm that precedes it (`ln_cross_attn`).

    Args:
        self: The object instance.
        config (object): Model configuration. The attributes read directly here are `hidden_size`,
            `n_inner`, `layer_norm_epsilon`, `add_cross_attention` and (only when cross-attention
            is enabled) `multi_query`. The same object is also handed to the attention and MLP layers.
        layer_idx (int, optional): Index of this block; forwarded to the attention layers.
            Defaults to None.

    Returns:
        None

    Raises:
        NotImplementedError: If `config.add_cross_attention` and `config.multi_query` are both enabled.
    """
    super().__init__()
    hidden_size = config.hidden_size
    # The MLP width falls back to 4 * hidden_size when `config.n_inner` is not set.
    self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size

    self.ln_1 = nn.LayerNorm(
        [hidden_size], eps=config.layer_norm_epsilon)
    self.attn = GPTBigCodeAttention(config, layer_idx=layer_idx)
    self.ln_2 = nn.LayerNorm(
        [hidden_size], eps=config.layer_norm_epsilon)

    if config.add_cross_attention:
        if config.multi_query:
            raise NotImplementedError(
                "Cross-attention not implemented for MQA")
        self.crossattention = GPTBigCodeAttention(
            config, is_cross_attention=True, layer_idx=layer_idx)
        # Use the same list-form normalized shape as `ln_1` / `ln_2` so all three
        # LayerNorm layers of the block are constructed identically.
        self.ln_cross_attn = nn.LayerNorm(
            [hidden_size], eps=config.layer_norm_epsilon)

    self.mlp = GPTBigCodeMLP(self.inner_dim, config)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeBlock.forward(hidden_states, layer_past=None, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, use_cache=False, output_attentions=False)

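Runs the forward pass of a single GPTBigCode transformer block: self-attention, optional cross-attention (when encoder_hidden_states is given), and the MLP, each followed by a residual connection.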

PARAMETER DESCRIPTION
self

The instance of the class.

hidden_states

The input hidden states.

TYPE: Optional[Tuple[Tensor]]

layer_past

The past hidden states of the layer.

TYPE: Optional[Tensor] DEFAULT: None

attention_mask

The attention mask to mask some positions in the input.

TYPE: Optional[Tensor] DEFAULT: None

head_mask

The mask applied to the heads of the multi-head attention.

TYPE: Optional[Tensor] DEFAULT: None

encoder_hidden_states

The hidden states of the encoder.

TYPE: Optional[Tensor] DEFAULT: None

encoder_attention_mask

The attention mask for the encoder.

TYPE: Optional[Tensor] DEFAULT: None

use_cache

Flag to indicate whether to use cache for faster decoding.

TYPE: Optional[bool] DEFAULT: False

output_attentions

Flag to indicate whether to output attentions.

TYPE: Optional[bool] DEFAULT: False

RETURNS DESCRIPTION
Union[Tuple[Tensor], Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor]]

Union[Tuple[mindspore.Tensor], Tuple[mindspore.Tensor, mindspore.Tensor], Tuple[mindspore.Tensor, mindspore.Tensor, mindspore.Tensor]]: The output of the method which may include the hidden states and optionally attention scores.

RAISES DESCRIPTION
ValueError

If encoder_hidden_states are passed but the cross-attention layers are not instantiated with the flag config.add_cross_attention=True.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
def forward(
    self,
    hidden_states: Optional[Tuple[mindspore.Tensor]],
    layer_past: Optional[mindspore.Tensor] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    encoder_hidden_states: Optional[mindspore.Tensor] = None,
    encoder_attention_mask: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = False,
    output_attentions: Optional[bool] = False,
) -> Union[
    Tuple[mindspore.Tensor], Tuple[mindspore.Tensor,
                                   mindspore.Tensor], Tuple[mindspore.Tensor, mindspore.Tensor, mindspore.Tensor]
]:
    """
    Run one transformer block: self-attention, optional cross-attention, then the MLP.

    Every sub-layer is applied to a layer-normalized copy of the running hidden states and its
    result is added back to the un-normalized input (residual connection).

    Args:
        self: The instance of the class.
        hidden_states (Optional[Tuple[mindspore.Tensor]]): The input hidden states.
        layer_past (Optional[mindspore.Tensor]): Cached state forwarded to the self-attention layer.
        attention_mask (Optional[mindspore.Tensor]): Mask forwarded to both attention layers.
        head_mask (Optional[mindspore.Tensor]): Head mask forwarded to both attention layers.
        encoder_hidden_states (Optional[mindspore.Tensor]): Encoder states; when not None the
            cross-attention sub-layer is executed.
        encoder_attention_mask (Optional[mindspore.Tensor]): Mask forwarded to the cross-attention layer.
        use_cache (Optional[bool]): Forwarded to self-attention. When falsy, the first of the extra
            self-attention outputs (the `present` slot) is dropped from the returned tuple.
        output_attentions (Optional[bool]): Forwarded to both attention layers.

    Returns:
        Union[Tuple[mindspore.Tensor], Tuple[mindspore.Tensor, mindspore.Tensor], Tuple[mindspore.Tensor, mindspore.Tensor, mindspore.Tensor]]:
            A tuple laid out as `hidden_states, present, (attentions, cross_attentions)`.

    Raises:
        ValueError:
            If `encoder_hidden_states` are passed but the block has no `crossattention` attribute, i.e. it
            was not built with `config.add_cross_attention=True`.
    """
    # ---- self-attention sub-layer ----
    self_attn_result = self.attn(
        self.ln_1(hidden_states),
        layer_past=layer_past,
        attention_mask=attention_mask,
        head_mask=head_mask,
        use_cache=use_cache,
        output_attentions=output_attentions,
    )
    # self_attn_result is laid out as: attention output, present, (attentions)
    extras = self_attn_result[1:]
    hidden_states = self_attn_result[0] + hidden_states

    # ---- optional cross-attention sub-layer ----
    if encoder_hidden_states is not None:
        if not hasattr(self, "crossattention"):
            raise ValueError(
                f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
                "cross-attention layers by setting `config.add_cross_attention=True`"
            )
        cross_attn_result = self.crossattention(
            self.ln_cross_attn(hidden_states),
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = hidden_states + cross_attn_result[0]
        # Only entries from index 2 onwards (the cross-attention weights, if any) are kept.
        extras = extras + cross_attn_result[2:]

    # ---- feed-forward sub-layer ----
    hidden_states = hidden_states + self.mlp(self.ln_2(hidden_states))

    if not use_cache:
        # Without caching the leading `present` entry is not returned.
        extras = extras[1:]

    # hidden_states, present, (attentions, cross_attentions)
    return (hidden_states,) + extras

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForCausalLM

Bases: GPTBigCodePreTrainedModel

GPT BigCode for CausalLM

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel):
    """GPTBigCode transformer with a bias-free linear language-modeling head (`lm_head`) on top."""
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """
        Create the backbone transformer and the language-modeling head.

        Args:
            self (object): The instance of the class.
            config (object): Model configuration. It is passed to the parent class and to
                `GPTBigCodeModel`; `n_embd` and `vocab_size` are read here to size the head.

        Returns:
            None.
        """
        super().__init__(config)
        self.transformer = GPTBigCodeModel(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        """
        Return the output embedding layer of the model.

        Args:
            self (GPTBigCodeForCausalLM): The instance of the GPTBigCodeForCausalLM class.

        Returns:
            The layer currently stored in `self.lm_head`.
        """
        output_layer = self.lm_head
        return output_layer

    def set_output_embeddings(self, new_embeddings):
        """
        Replace the output embedding layer of the model.

        Args:
            self (GPTBigCodeForCausalLM): The instance of the GPTBigCodeForCausalLM class.
            new_embeddings (object): The layer to store as `self.lm_head`.

        Returns:
            None.
        """
        setattr(self, "lm_head", new_embeddings)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """
        Assemble the keyword arguments for one generation step.

        Args:
            self (GPTBigCodeForCausalLM): An instance of the GPTBigCodeForCausalLM class.
            input_ids: Token ids; indexed as `[batch, sequence]`.
            past_key_values (tuple, optional): Cache from previous steps. When truthy, `input_ids`
                (and `token_type_ids`) are trimmed to the tokens not yet covered by the cache.
            inputs_embeds (optional): Input embeddings; only used when `past_key_values` is None.
            **kwargs: `token_type_ids`, `attention_mask`, `position_ids` and `use_cache` are read.

        Returns:
            dict: Holds either 'inputs_embeds' or 'input_ids', followed by 'past_key_values',
                'use_cache', 'position_ids', 'attention_mask' and 'token_type_ids'.

                Note that a caller-supplied `position_ids` is never forwarded: 'position_ids' is
                derived from `attention_mask` when a mask is given and no `position_ids` were
                passed, and is None in every other case.
        """
        token_type_ids = kwargs.get("token_type_ids")
        attention_mask = kwargs.get("attention_mask")
        position_ids = kwargs.get("position_ids")

        # Omit tokens covered by past_key_values
        if past_key_values:
            # The cached length is read from axis 1 for multi-query caches, axis 2 otherwise.
            length_axis = 1 if self.config.multi_query else 2
            cached_length = past_key_values[0].shape[length_axis]
            current_length = input_ids.shape[1]

            if current_length > cached_length:
                # The caller passed the full sequence: strip the cached prefix.
                drop = cached_length
            else:
                # Default to old behavior: keep only final ID
                drop = current_length - 1

            input_ids = input_ids[:, drop:]
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -input_ids.shape[1]:]

        if attention_mask is None or position_ids is not None:
            position_ids = None
        else:
            # create position_ids on the fly for batch generation
            positions = attention_mask.long().cumsum(-1) - 1
            positions = positions.masked_fill(attention_mask == 0, 1)
            position_ids = positions[:, -input_ids.shape[1]:] if past_key_values else positions

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        first_step_with_embeds = inputs_embeds is not None and past_key_values is None
        primary = {"inputs_embeds": inputs_embeds} if first_step_with_embeds else {"input_ids": input_ids}

        return {
            **primary,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        encoder_hidden_states: Optional[mindspore.Tensor] = None,
        encoder_attention_mask: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        Run the backbone, project to vocabulary logits and optionally compute the LM loss.

        Args:
            labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
                `labels = input_ids`. Labels set to `-100` are expected to be ignored by the loss (this relies on
                the default `ignore_index` of `nn.CrossEntropyLoss` - confirm).

        All other arguments are forwarded unchanged to `self.transformer`. When `return_dict` is None,
        `self.config.use_return_dict` decides the output format.

        Returns:
            A `CausalLMOutputWithCrossAttentions`, or (when `return_dict` is falsy) a tuple
            `(loss?, lm_logits, *transformer_outputs[1:])` where `loss` is present only if it was computed.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        lm_logits = self.lm_head(transformer_outputs[0])

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            # Flatten the tokens
            flat_logits = shift_logits.view(-1, shift_logits.shape[-1])
            flat_labels = shift_labels.view(-1).to(mindspore.int32)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return CausalLMOutputWithCrossAttentions(
                loss=loss,
                logits=lm_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
                cross_attentions=transformer_outputs.cross_attentions,
            )

        output = (lm_logits,) + transformer_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    @staticmethod
    def _reorder_cache(
        past_key_values: Tuple[Tuple[mindspore.Tensor]], beam_idx: mindspore.Tensor
    ) -> Tuple[Tuple[mindspore.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Each entry of `past_key_values` is re-indexed along axis 0 with `beam_idx`.
        """
        reordered = [layer_past.index_select(0, beam_idx) for layer_past in past_key_values]
        return tuple(reordered)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForCausalLM.__init__(config)

Initializes the GPTBigCodeForCausalLM class.

PARAMETER DESCRIPTION
self

The instance of the class.

TYPE: object

config

A configuration object containing settings for the GPTBigCodeForCausalLM model. It should include parameters such as n_embd (embedding dimension) and vocab_size (vocabulary size).

TYPE: object

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
def __init__(self, config):
    """
    Create the backbone transformer and the language-modeling head.

    Args:
        self (object): The instance of the class.
        config (object): Model configuration. It is passed to the parent class and to
            `GPTBigCodeModel`; `n_embd` and `vocab_size` are read here to size the head.

    Returns:
        None.
    """
    super().__init__(config)
    self.transformer = GPTBigCodeModel(config)
    # Bias-free projection from the embedding width to the vocabulary size.
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForCausalLM.forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION
labels

Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set labels = input_ids Indices are selected in [-100, 0, ..., config.vocab_size] All labels set to -100 are ignored (masked), the loss is only computed for labels in [0, ..., config.vocab_size]

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    encoder_hidden_states: Optional[mindspore.Tensor] = None,
    encoder_attention_mask: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
    r"""
    Run the backbone, project to vocabulary logits and optionally compute the LM loss.

    Args:
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids`. Labels set to `-100` are expected to be ignored by the loss (this relies on
            the default `ignore_index` of `nn.CrossEntropyLoss` - confirm).

    All other arguments are forwarded unchanged to `self.transformer`. When `return_dict` is None,
    `self.config.use_return_dict` decides the output format.

    Returns:
        A `CausalLMOutputWithCrossAttentions`, or (when `return_dict` is falsy) a tuple
        `(loss?, lm_logits, *transformer_outputs[1:])` where `loss` is present only if it was computed.
    """
    if return_dict is None:
        return_dict = self.config.use_return_dict

    transformer_outputs = self.transformer(
        input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # Index 0 of the backbone output holds the final hidden states.
    lm_logits = self.lm_head(transformer_outputs[0])

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = lm_logits[..., :-1, :]
        shift_labels = labels[..., 1:]
        # Flatten the tokens
        flat_logits = shift_logits.view(-1, shift_logits.shape[-1])
        flat_labels = shift_labels.view(-1).to(mindspore.int32)
        loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

    if return_dict:
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
            cross_attentions=transformer_outputs.cross_attentions,
        )

    output = (lm_logits,) + transformer_outputs[1:]
    if loss is None:
        return output
    return (loss,) + output

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForCausalLM.get_output_embeddings()

Returns the output embeddings of the GPTBigCodeForCausalLM model.

PARAMETER DESCRIPTION
self

The instance of the GPTBigCodeForCausalLM class.

TYPE: GPTBigCodeForCausalLM

RETURNS DESCRIPTION

The layer stored in `lm_head`, which serves as the model's output embedding layer.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
943
944
945
946
947
948
949
950
951
952
953
954
955
956
def get_output_embeddings(self):
    """
    Return the output embedding layer of the model.

    Args:
        self (GPTBigCodeForCausalLM): The instance of the GPTBigCodeForCausalLM class.

    Returns:
        The layer currently stored in `self.lm_head`.
    """
    output_layer = self.lm_head
    return output_layer

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForCausalLM.prepare_inputs_for_generation(input_ids, past_key_values=None, inputs_embeds=None, **kwargs)

Prepare inputs for generation.

PARAMETER DESCRIPTION
self

An instance of the GPTBigCodeForCausalLM class.

TYPE: GPTBigCodeForCausalLM

input_ids

The input tensor of shape [batch_size, sequence_length].

TYPE: Tensor

past_key_values

The tuple of past key values. Default is None.

TYPE: tuple DEFAULT: None

inputs_embeds

The embedded inputs tensor of shape [batch_size, sequence_length, embedding_size]. Default is None.

TYPE: Tensor DEFAULT: None

RETURNS DESCRIPTION
dict

A dictionary containing the model inputs for generation. The dictionary may contain the following keys:

  • 'inputs_embeds' (mindspore.Tensor): The embedded inputs tensor.
  • 'input_ids' (mindspore.Tensor): The input tensor.
  • 'past_key_values' (tuple): The tuple of past key values.
  • 'use_cache' (bool): Whether to use cache.
  • 'position_ids' (mindspore.Tensor): The position ids tensor.
  • 'attention_mask' (mindspore.Tensor): The attention mask tensor.
  • 'token_type_ids' (mindspore.Tensor): The token type ids tensor.
Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
    """
    Assemble the keyword arguments for one generation step.

    Args:
        self (GPTBigCodeForCausalLM): An instance of the GPTBigCodeForCausalLM class.
        input_ids: Token ids; indexed as `[batch, sequence]`.
        past_key_values (tuple, optional): Cache from previous steps. When truthy, `input_ids`
            (and `token_type_ids`) are trimmed to the tokens not yet covered by the cache.
        inputs_embeds (optional): Input embeddings; only used when `past_key_values` is None.
        **kwargs: `token_type_ids`, `attention_mask`, `position_ids` and `use_cache` are read.

    Returns:
        dict: Holds either 'inputs_embeds' or 'input_ids', followed by 'past_key_values',
            'use_cache', 'position_ids', 'attention_mask' and 'token_type_ids'.

            Note that a caller-supplied `position_ids` is never forwarded: 'position_ids' is
            derived from `attention_mask` when a mask is given and no `position_ids` were
            passed, and is None in every other case.
    """
    token_type_ids = kwargs.get("token_type_ids")
    attention_mask = kwargs.get("attention_mask")
    position_ids = kwargs.get("position_ids")

    # Omit tokens covered by past_key_values
    if past_key_values:
        # The cached length is read from axis 1 for multi-query caches, axis 2 otherwise.
        length_axis = 1 if self.config.multi_query else 2
        cached_length = past_key_values[0].shape[length_axis]
        current_length = input_ids.shape[1]

        if current_length > cached_length:
            # The caller passed the full sequence: strip the cached prefix.
            drop = cached_length
        else:
            # Default to old behavior: keep only final ID
            drop = current_length - 1

        input_ids = input_ids[:, drop:]
        if token_type_ids is not None:
            token_type_ids = token_type_ids[:, -input_ids.shape[1]:]

    if attention_mask is None or position_ids is not None:
        position_ids = None
    else:
        # create position_ids on the fly for batch generation
        positions = attention_mask.long().cumsum(-1) - 1
        positions = positions.masked_fill(attention_mask == 0, 1)
        position_ids = positions[:, -input_ids.shape[1]:] if past_key_values else positions

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    first_step_with_embeds = inputs_embeds is not None and past_key_values is None
    primary = {"inputs_embeds": inputs_embeds} if first_step_with_embeds else {"input_ids": input_ids}

    return {
        **primary,
        "past_key_values": past_key_values,
        "use_cache": kwargs.get("use_cache"),
        "position_ids": position_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    }

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForCausalLM.set_output_embeddings(new_embeddings)

Sets the output embeddings for the GPTBigCodeForCausalLM model.

PARAMETER DESCRIPTION
self

The instance of the GPTBigCodeForCausalLM class.

TYPE: GPTBigCodeForCausalLM

new_embeddings

The new embeddings to be set as output embeddings for the model.

TYPE: object

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
def set_output_embeddings(self, new_embeddings):
    """
    Replace the output embedding layer of the model.

    Args:
        self (GPTBigCodeForCausalLM): The instance of the GPTBigCodeForCausalLM class.
        new_embeddings (object): The layer to store as `self.lm_head`.

    Returns:
        None.
    """
    setattr(self, "lm_head", new_embeddings)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForSequenceClassification

Bases: GPTBigCodePreTrainedModel

GPT BigCode for Sequence Classification

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
class GPTBigCodeForSequenceClassification(GPTBigCodePreTrainedModel):
    """
    GPT BigCode for Sequence Classification

    A `GPTBigCodeModel` backbone followed by a bias-free linear scoring head. The head
    scores every position; one position per sequence is then selected as the
    sequence-level logits.
    """
    def __init__(self, config):
        """
        Initializes a new instance of the GPTBigCodeForSequenceClassification class.

        Args:
            self: The object itself.
            config (GPTBigCodeConfig): The configuration object. This constructor reads
                `num_labels` and `n_embd` from it and passes the whole object to the
                parent class and to `GPTBigCodeModel`.

        Returns:
            None
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPTBigCodeModel(config)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        Run the backbone, score every position, and pool one position per sequence.

        Args:
            input_ids (`mindspore.Tensor`, *optional*):
                Forwarded to the backbone. When given, it also provides the batch size and,
                if `config.pad_token_id` is set, the per-sequence pooling index.
            inputs_embeds (`mindspore.Tensor`, *optional*):
                Forwarded to the backbone. Provides the batch size when `input_ids` is None;
                in that case the last position of every sequence is pooled.
            past_key_values, attention_mask, token_type_ids, position_ids, head_mask,
            use_cache, output_attentions, output_hidden_states:
                Forwarded unchanged to the backbone.
            labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
                Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
                config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
                `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
            return_dict (`bool`, *optional*):
                Falls back to `config.use_return_dict` when None.

        Returns:
            `SequenceClassifierOutputWithPast` when `return_dict` is truthy; otherwise a tuple of
            the pooled logits followed by `transformer_outputs[1:]`, prefixed with the loss when
            `labels` is given.

        Raises:
            ValueError: If `config.pad_token_id` is None and the batch size is not 1.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, _ = input_ids.shape[:2]
        else:
            batch_size, _ = inputs_embeds.shape[:2]

        # Raised explicitly rather than asserted so the check survives `python -O`;
        # without it a multi-row batch would silently pool position -1 for every row.
        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if self.config.pad_token_id is not None and input_ids is not None:
            # Count of non-pad tokens minus one. This is the index of the last real token
            # only when padding is on the right and the pad id does not occur mid-sequence.
            sequence_lengths = ops.ne(
                input_ids, self.config.pad_token_id).sum(-1) - 1
        else:
            # No way to locate padding here: use the last position.
            sequence_lengths = -1

        pooled_logits = logits[ops.arange(batch_size), sequence_lengths]

        loss = None
        if labels is not None:

            # The inferred problem type is written back to the config, so it is only
            # inferred once per config object.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(
                    pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForSequenceClassification.__init__(config)

Initializes a new instance of the GPTBigCodeForSequenceClassification class.

PARAMETER DESCRIPTION
self

The object itself.

config

The configuration object specifying the model's hyperparameters and settings.

TYPE: GPTBigCodeConfig

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
def __init__(self, config):
    """
    Build the sequence-classification model: a backbone plus a scoring head.

    Args:
        self: The instance being initialized.
        config (GPTBigCodeConfig): Model configuration. This constructor reads `num_labels`
            and `n_embd` from it and passes the whole object to the parent class and to
            `GPTBigCodeModel`.

    Returns:
        None
    """
    super().__init__(config)
    label_count = config.num_labels
    self.num_labels = label_count
    self.transformer = GPTBigCodeModel(config)
    # Scoring head: projects `n_embd` features to one score per label, no bias term.
    self.score = nn.Linear(config.n_embd, label_count, bias=False)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForSequenceClassification.forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION
labels

Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., config.num_labels - 1]. If config.num_labels == 1, a regression loss is computed (mean-square loss); if config.num_labels > 1, a classification loss is computed (cross-entropy).

TYPE: `mindspore.Tensor` of shape `(batch_size,)`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
    r"""
    Run the backbone, score every position, and pool one position per sequence.

    Args:
        input_ids (`mindspore.Tensor`, *optional*):
            Forwarded to the backbone. When given, it also provides the batch size and,
            if `config.pad_token_id` is set, the per-sequence pooling index.
        inputs_embeds (`mindspore.Tensor`, *optional*):
            Forwarded to the backbone. Provides the batch size when `input_ids` is None;
            in that case the last position of every sequence is pooled.
        past_key_values, attention_mask, token_type_ids, position_ids, head_mask,
        use_cache, output_attentions, output_hidden_states:
            Forwarded unchanged to the backbone.
        labels (`mindspore.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        return_dict (`bool`, *optional*):
            Falls back to `config.use_return_dict` when None.

    Returns:
        `SequenceClassifierOutputWithPast` when `return_dict` is truthy; otherwise a tuple of
        the pooled logits followed by `transformer_outputs[1:]`, prefixed with the loss when
        `labels` is given.

    Raises:
        ValueError: If `config.pad_token_id` is None and the batch size is not 1.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    transformer_outputs = self.transformer(
        input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = transformer_outputs[0]
    logits = self.score(hidden_states)

    if input_ids is not None:
        batch_size, _ = input_ids.shape[:2]
    else:
        batch_size, _ = inputs_embeds.shape[:2]

    # Raised explicitly rather than asserted so the check survives `python -O`;
    # without it a multi-row batch would silently pool position -1 for every row.
    if self.config.pad_token_id is None and batch_size != 1:
        raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

    if self.config.pad_token_id is not None and input_ids is not None:
        # Count of non-pad tokens minus one. This is the index of the last real token
        # only when padding is on the right and the pad id does not occur mid-sequence.
        sequence_lengths = ops.ne(
            input_ids, self.config.pad_token_id).sum(-1) - 1
    else:
        # No way to locate padding here: use the last position.
        sequence_lengths = -1

    pooled_logits = logits[ops.arange(batch_size), sequence_lengths]

    loss = None
    if labels is not None:

        # The inferred problem type is written back to the config, so it is only
        # inferred once per config object.
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.num_labels > 1 and labels.dtype in (mindspore.int64, mindspore.int32):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        if self.config.problem_type == "regression":
            loss_fct = nn.MSELoss()
            if self.num_labels == 1:
                loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(pooled_logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                pooled_logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(pooled_logits, labels)
    if not return_dict:
        output = (pooled_logits,) + transformer_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return SequenceClassifierOutputWithPast(
        loss=loss,
        logits=pooled_logits,
        past_key_values=transformer_outputs.past_key_values,
        hidden_states=transformer_outputs.hidden_states,
        attentions=transformer_outputs.attentions,
    )

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForTokenClassification

Bases: GPTBigCodePreTrainedModel

GPT BigCode for Token Classification

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
class GPTBigCodeForTokenClassification(GPTBigCodePreTrainedModel):
    """GPT BigCode for Token Classification"""
    def __init__(self, config):
        """
        Build the token-classification model: backbone, dropout, and a linear classifier.

        Args:
            self: The instance being initialized.
            config: Model configuration. This constructor reads:

                - num_labels: number of output labels of the classifier.
                - hidden_size: input feature size of the classifier.
                - classifier_dropout: (optional) dropout rate applied before the classifier.
                - hidden_dropout: (optional) fallback dropout rate.

                Note:
                    The dropout rate is `classifier_dropout` if present and not None, else
                    `hidden_dropout` if present and not None, else 0.1.

        Returns:
            None.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        self.transformer = GPTBigCodeModel(config)

        # Resolve the dropout rate through the fallback chain described above.
        dropout_rate = getattr(config, "classifier_dropout", None)
        if dropout_rate is None:
            dropout_rate = getattr(config, "hidden_dropout", None)
        if dropout_rate is None:
            dropout_rate = 0.1
        self.dropout = nn.Dropout(p=dropout_rate)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        labels: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        Run the backbone and classify every position.

        Args:
            labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the token classification loss. Indices should be in `[0, ...,
                config.num_labels - 1]`. When given, a cross-entropy loss is computed over the
                flattened logits and labels.
            return_dict (`bool`, *optional*):
                Falls back to `config.use_return_dict` when None.

            All remaining arguments are forwarded unchanged to the backbone.

        Returns:
            `TokenClassifierOutput` when `return_dict` is truthy; otherwise a tuple of the logits
            followed by `transformer_outputs[2:]`, prefixed with the loss when `labels` is given.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        backbone_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        token_states = self.dropout(backbone_outputs[0])
        logits = self.classifier(token_states)

        loss = None
        if labels is not None:
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=backbone_outputs.hidden_states,
                attentions=backbone_outputs.attentions,
            )

        # Tuple form: the first two backbone outputs are dropped
        # (NOTE(review): index 1 is presumably the key/value cache — confirm).
        output = (logits,) + backbone_outputs[2:]
        if loss is None:
            return output
        return (loss,) + output

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForTokenClassification.__init__(config)

Initializes an instance of the GPTBigCodeForTokenClassification class.

PARAMETER DESCRIPTION
self

The instance of the class.

config

An object containing configuration settings for the model. It must have the following attributes:

  • num_labels: An integer specifying the number of output labels.
  • classifier_dropout: (optional) A float specifying the dropout rate for the classifier layer.
  • hidden_dropout: (optional) A float specifying the dropout rate for hidden layers.

Note: If both classifier_dropout and hidden_dropout are provided, classifier_dropout takes precedence.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
def __init__(self, config):
    """
    Build the token-classification model: backbone, dropout, and a linear classifier.

    Args:
        self: The instance being initialized.
        config: Model configuration. This constructor reads:

            - num_labels: number of output labels of the classifier.
            - hidden_size: input feature size of the classifier.
            - classifier_dropout: (optional) dropout rate applied before the classifier.
            - hidden_dropout: (optional) fallback dropout rate.

            Note:
                The dropout rate is `classifier_dropout` if present and not None, else
                `hidden_dropout` if present and not None, else 0.1.

    Returns:
        None.
    """
    super().__init__(config)
    self.num_labels = config.num_labels

    self.transformer = GPTBigCodeModel(config)

    # Resolve the dropout rate through the fallback chain described above.
    dropout_rate = getattr(config, "classifier_dropout", None)
    if dropout_rate is None:
        dropout_rate = getattr(config, "hidden_dropout", None)
    if dropout_rate is None:
        dropout_rate = 0.1
    self.dropout = nn.Dropout(p=dropout_rate)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeForTokenClassification.forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

PARAMETER DESCRIPTION
labels

Labels for computing the token classification loss (cross-entropy). Indices should be in [0, ..., config.num_labels - 1].

TYPE: `mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional* DEFAULT: None

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[Tuple[Tuple[mindspore.Tensor]]] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    labels: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
    r"""
    Run the backbone and classify every position.

    Args:
        labels (`mindspore.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. When given, a cross-entropy loss is computed over the
            flattened logits and labels.
        return_dict (`bool`, *optional*):
            Falls back to `config.use_return_dict` when None.

        All remaining arguments are forwarded unchanged to the backbone.

    Returns:
        `TokenClassifierOutput` when `return_dict` is truthy; otherwise a tuple of the logits
        followed by `transformer_outputs[2:]`, prefixed with the loss when `labels` is given.
    """
    if return_dict is None:
        return_dict = self.config.use_return_dict

    backbone_outputs = self.transformer(
        input_ids,
        past_key_values=past_key_values,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    token_states = self.dropout(backbone_outputs[0])
    logits = self.classifier(token_states)

    loss = None
    if labels is not None:
        flat_logits = logits.view(-1, self.num_labels)
        flat_labels = labels.view(-1)
        loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

    if return_dict:
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
        )

    # Tuple form: the first two backbone outputs are dropped
    # (NOTE(review): index 1 is presumably the key/value cache — confirm).
    output = (logits,) + backbone_outputs[2:]
    if loss is None:
        return output
    return (loss,) + output

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeMLP

Bases: Module

GPT BigCode MLP

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
class GPTBigCodeMLP(nn.Module):
    """
    GPT BigCode MLP

    Feed-forward sub-layer: linear projection to `intermediate_size`, activation,
    linear projection back to `config.hidden_size`, then dropout.
    """
    def __init__(self, intermediate_size, config):
        """
        Initializes an instance of the GPTBigCodeMLP class.

        Args:
            self: The object itself.
            intermediate_size (int): The size of the intermediate layer.
            config (object): The configuration object. This constructor reads `hidden_size`,
                `activation_function` (a key into `ACT2FN`) and `resid_pdrop` from it.

        Returns:
            None
        """
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = nn.Linear(embed_dim, intermediate_size)
        self.c_proj = nn.Linear(intermediate_size, embed_dim)
        self.act = ACT2FN[config.activation_function]
        self.dropout = nn.Dropout(p=config.resid_pdrop)

    def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor:
        """
        Apply the MLP to the provided hidden states.

        Args:
            self: The instance of the GPTBigCodeMLP class.
            hidden_states (mindspore.Tensor):
                The hidden states to process. The argument is required and is passed
                directly to the input projection `c_fc`.

        Returns:
            mindspore.Tensor:
                The result of applying `c_fc`, the activation, `c_proj` and dropout, in that order.
        """
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeMLP.__init__(intermediate_size, config)

Initializes an instance of the GPTBigCodeMLP class.

PARAMETER DESCRIPTION
self

The object itself.

intermediate_size

The size of the intermediate layer.

TYPE: int

config

The configuration object with various settings for the model.

TYPE: object

RETURNS DESCRIPTION

None

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
def __init__(self, intermediate_size, config):
    """
    Create the two projections, the activation and the dropout of the MLP.

    Args:
        self: The instance being initialized.
        intermediate_size (int): Output size of the first projection and input size of the second.
        config (object): The configuration object. This constructor reads `hidden_size`,
            `activation_function` (a key into `ACT2FN`) and `resid_pdrop` from it.

    Returns:
        None
    """
    super().__init__()
    hidden_dim = config.hidden_size
    # hidden_size -> intermediate_size, then back to hidden_size.
    self.c_fc = nn.Linear(hidden_dim, intermediate_size)
    self.c_proj = nn.Linear(intermediate_size, hidden_dim)
    activation_name = config.activation_function
    self.act = ACT2FN[activation_name]
    self.dropout = nn.Dropout(p=config.resid_pdrop)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeMLP.forward(hidden_states)

Applies the GPT BigCode multi-layer perceptron to the provided hidden states: input projection, activation, output projection, then dropout.

PARAMETER DESCRIPTION
self

The instance of the GPTBigCodeMLP class.

hidden_states

The hidden states to be processed by the multi-layer perceptron. This argument is required; it has no default value.

TYPE: Optional[Tuple[Tensor]]

RETURNS DESCRIPTION
Tensor

mindspore.Tensor: A tensor representing the processed hidden states after passing through the multi-layer perceptron.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
def forward(self, hidden_states: mindspore.Tensor) -> mindspore.Tensor:
    """
    Apply the MLP to the provided hidden states.

    Args:
        self: The instance of the GPTBigCodeMLP class.
        hidden_states (mindspore.Tensor):
            The hidden states to process. The argument is required and is passed
            directly to the input projection `c_fc`.

    Returns:
        mindspore.Tensor:
            The result of applying `c_fc`, the activation, `c_proj` and dropout, in that order.
    """
    hidden_states = self.c_fc(hidden_states)
    hidden_states = self.act(hidden_states)
    hidden_states = self.c_proj(hidden_states)
    hidden_states = self.dropout(hidden_states)
    return hidden_states

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeModel

Bases: GPTBigCodePreTrainedModel

GPT BigCode Model

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
class GPTBigCodeModel(GPTBigCodePreTrainedModel):
    """GPT BigCode Model

    Token and position embeddings, a stack of ``GPTBigCodeBlock`` layers and a
    final layer normalization.
    """
    def __init__(self, config):
        """
        Initializes the GPTBigCodeModel class.

        Args:
            config: Model configuration. The attributes read here are:

                - multi_query (bool): stored on the model; ``forward`` uses it to pick the
                  axis on which the attention masks are expanded (MQA vs. MHA layout).
                - hidden_size (int): embedding dimension.
                - vocab_size (int): number of entries of the token embedding table.
                - max_position_embeddings (int): number of entries of the position
                  embedding table and side length of the causal mask.
                - embd_pdrop (float): dropout probability applied to the embeddings.
                - num_hidden_layers (int): number of ``GPTBigCodeBlock`` layers.
                - layer_norm_epsilon (float): epsilon of the final layer normalization.
        """
        super().__init__(config)
        self.multi_query = config.multi_query
        self.embed_dim = config.hidden_size

        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)

        self.drop = nn.Dropout(p=config.embd_pdrop)
        self.h = nn.ModuleList([GPTBigCodeBlock(config, layer_idx=i)
                              for i in range(config.num_hidden_layers)])
        self.ln_f = nn.LayerNorm(
            [self.embed_dim], eps=config.layer_norm_epsilon)

        self.gradient_checkpointing = False

        # Lower-triangular boolean matrix; forward() slices it to build the causal mask.
        max_positions = config.max_position_embeddings
        self.bias = Tensor(
            np.tril(np.ones((max_positions, max_positions))), mindspore.bool_)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """
        Returns the token embedding layer of the model.

        Returns:
            The object currently stored in ``self.wte`` (created as ``nn.Embedding`` in ``__init__``).
        """
        return self.wte

    def set_input_embeddings(self, new_embeddings):
        """
        Replaces the token embedding layer of the model.

        Args:
            new_embeddings: The object to store as ``self.wte``. No validation is performed here.

        Returns:
            None.
        """
        self.wte = new_embeddings

    def forward(
        self,
        input_ids: Optional[mindspore.Tensor] = None,
        past_key_values: Optional[List[mindspore.Tensor]] = None,
        attention_mask: Optional[mindspore.Tensor] = None,
        token_type_ids: Optional[mindspore.Tensor] = None,
        position_ids: Optional[mindspore.Tensor] = None,
        head_mask: Optional[mindspore.Tensor] = None,
        inputs_embeds: Optional[mindspore.Tensor] = None,
        encoder_hidden_states: Optional[mindspore.Tensor] = None,
        encoder_attention_mask: Optional[mindspore.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        """
        Runs the embeddings, every transformer block and the final layer norm.

        Args:
            input_ids: Token ids; reshaped to ``(-1, input_ids.shape[-1])``.
                Mutually exclusive with ``inputs_embeds``.
            past_key_values: One cached entry per block. When given, the past length is read
                from ``past_key_values[0].shape[-2]``.
            attention_mask: Padding mask. It is multiplied into the causal mask; when it is 2D and
                ``position_ids`` is None, position ids are derived from its cumulative sum.
            token_type_ids: Optional ids that are embedded with ``self.wte`` and added to the
                hidden states.
            position_ids: Optional position ids fed to ``self.wpe``.
            head_mask: Optional head mask, expanded per layer by ``self.get_head_mask``.
            inputs_embeds: Pre-computed input embeddings used instead of ``input_ids``.
            encoder_hidden_states: Encoder states forwarded to every block.
            encoder_attention_mask: Cross-attention mask. Only used when
                ``config.add_cross_attention`` is set and ``encoder_hidden_states`` is given;
                a 2D mask is expanded to 3D, otherwise the mask is replaced by None.
            use_cache: Whether to collect each block's ``outputs[1]`` into ``past_key_values``.
                Falls back to ``config.use_cache``.
            output_attentions: Whether to collect attention outputs.
                Falls back to ``config.output_attentions``.
            output_hidden_states: Whether to collect the hidden states before every block and
                after the final norm. Falls back to ``config.output_hidden_states``.
            return_dict: Whether to return a ``BaseModelOutputWithPastAndCrossAttentions``.
                Falls back to ``config.use_return_dict``.

        Returns:
            A ``BaseModelOutputWithPastAndCrossAttentions`` when ``return_dict`` is true, otherwise a
            tuple with the non-None values among (hidden states, presents, all hidden states,
            self-attentions, cross-attentions).

        Raises:
            ValueError: If both input_ids and inputs_embeds are specified.
            ValueError: If neither input_ids nor inputs_embeds are specified.
            ValueError: If batch_size is less than or equal to 0.
            AssertionError: If the encoder_attention_mask is not 3D after the optional expansion.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time")

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(
                input_ids, attention_mask)
            input_shape = input_ids.shape
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.shape[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError(
                "You have to specify either input_ids or inputs_embeds")

        if batch_size <= 0:
            raise ValueError("batch_size has to be defined and > 0")

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])

        if past_key_values is None:
            past_length = 0
            past_key_values = tuple([None] * len(self.h))
        else:
            past_length = past_key_values[0].shape[-2]

        if attention_mask is not None and len(attention_mask.shape) == 2 and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids = position_ids.masked_fill(attention_mask == 0, 1)
            if past_length > 0:
                position_ids = position_ids[:, past_length: input_shape[-1] + past_length]
        elif position_ids is None:
            position_ids = ops.arange(
                past_length, input_shape[-1] + past_length, dtype=mindspore.int64)
            position_ids = position_ids.unsqueeze(0)

        # Self-attention mask.
        query_length = input_shape[-1]
        key_length = past_length + query_length
        self_attention_mask = self.bias[None, key_length - query_length: key_length, :key_length]

        if attention_mask is not None:
            self_attention_mask = self_attention_mask * \
                attention_mask.view(batch_size, 1, -1)
            self_attention_mask = self_attention_mask.bool()

        # MQA models: (batch_size, query_length, n_heads, key_length)
        # MHA models: (batch_size, n_heads, query_length, key_length)
        attention_mask = ops.unsqueeze(
            self_attention_mask, 2 if self.multi_query else 1)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if (
            self.config.add_cross_attention
            and encoder_hidden_states is not None
            and encoder_attention_mask is not None
        ):
            if encoder_attention_mask.dim() == 2:
                # unsqueeze returns a new tensor; without the assignment a 2D mask
                # stays 2D and always fails the rank check below.
                encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
            assert encoder_attention_mask.dim() == 3, \
                "encoder_attention_mask must be 2D or 3D"
            encoder_attention_mask = encoder_attention_mask.bool(
            ).unsqueeze(2 if self.multi_query else 1)
        else:
            encoder_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # head_mask has shape n_layer x batch x n_heads x N x N
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        if token_type_ids is not None:
            # Token types share the token embedding table.
            token_type_embeds = self.wte(token_type_ids)
            hidden_states = hidden_states + token_type_embeds

        hidden_states = self.drop(hidden_states)
        output_shape = input_shape + (hidden_states.shape[-1],)

        presents = [] if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )

            hidden_states = outputs[0]
            if use_cache:
                presents.append(outputs[1])

            if output_attentions:
                # The attention entries shift by one position when the cache entry is present.
                all_self_attentions = all_self_attentions + \
                    (outputs[2 if use_cache else 1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + \
                        (outputs[3 if use_cache else 2],)

        hidden_states = self.ln_f(hidden_states)

        hidden_states = hidden_states.view(output_shape)
        # Add last hidden state
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeModel.__init__(config)

__init__

Initializes the GPTBigCodeModel class.

PARAMETER DESCRIPTION
self(GPTBigCodeModel)

The instance of the GPTBigCodeModel class.

config(Config)

An instance of the Config class containing configuration parameters for the model. The configuration parameters include:

  • multi_query: bool Specifies whether the model uses multi-query attention (MQA) instead of multi-head attention (MHA).
  • hidden_size: int Specifies the dimension of the hidden layers.
  • vocab_size: int Specifies the size of the vocabulary.
  • max_position_embeddings: int Specifies the maximum number of positions for embeddings.
  • embd_pdrop: float Specifies the dropout probability for the embeddings.
  • num_hidden_layers: int Specifies the number of hidden layers in the model.
  • layer_norm_epsilon: float Specifies the epsilon value for layer normalization.

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
def __init__(self, config):
    """
    Build the embeddings, the transformer blocks, the final norm and the causal mask.

    Args:
        config: Model configuration. The attributes read here are:

            - multi_query (bool): stored on the model as ``self.multi_query``.
            - hidden_size (int): embedding dimension.
            - vocab_size (int): number of entries of the token embedding table.
            - max_position_embeddings (int): number of entries of the position
              embedding table and side length of the causal mask.
            - embd_pdrop (float): dropout probability of ``self.drop``.
            - num_hidden_layers (int): number of ``GPTBigCodeBlock`` layers.
            - layer_norm_epsilon (float): epsilon of the final layer normalization.

    Returns:
        None.
    """
    super().__init__(config)
    self.multi_query = config.multi_query
    hidden_dim = config.hidden_size
    self.embed_dim = hidden_dim

    # Token and position embedding tables.
    self.wte = nn.Embedding(config.vocab_size, hidden_dim)
    self.wpe = nn.Embedding(config.max_position_embeddings, hidden_dim)

    self.drop = nn.Dropout(p=config.embd_pdrop)

    # One block per hidden layer; each block is told its own index.
    blocks = [
        GPTBigCodeBlock(config, layer_idx=layer_idx)
        for layer_idx in range(config.num_hidden_layers)
    ]
    self.h = nn.ModuleList(blocks)
    self.ln_f = nn.LayerNorm([hidden_dim], eps=config.layer_norm_epsilon)

    self.gradient_checkpointing = False

    # Lower-triangular boolean matrix covering every supported position.
    n_positions = config.max_position_embeddings
    causal = np.tril(np.ones((n_positions, n_positions)))
    self.bias = Tensor(causal, mindspore.bool_)

    # Initialize weights and apply final processing
    self.post_init()

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeModel.forward(input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None)

Runs the forward pass of the GPTBigCodeModel.

PARAMETER DESCRIPTION
self

The instance of the GPTBigCodeModel class.

TYPE: GPTBigCodeModel

input_ids

The input sequence tensor. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

past_key_values

List of tensors containing the past key values of the model. Defaults to None.

TYPE: Optional[List[Tensor]] DEFAULT: None

attention_mask

The attention mask tensor. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

token_type_ids

The token type ids tensor. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

position_ids

The position ids tensor. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

head_mask

The head mask tensor. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

inputs_embeds

The input embeddings tensor. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

encoder_hidden_states

The hidden states of the encoder. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

encoder_attention_mask

The attention mask for the encoder. Defaults to None.

TYPE: Optional[Tensor] DEFAULT: None

use_cache

Whether to use cache. Defaults to None.

TYPE: Optional[bool] DEFAULT: None

output_attentions

Whether to output attentions. Defaults to None.

TYPE: Optional[bool] DEFAULT: None

output_hidden_states

Whether to output hidden states. Defaults to None.

TYPE: Optional[bool] DEFAULT: None

return_dict

Whether to return a dictionary. Defaults to None.

TYPE: Optional[bool] DEFAULT: None

RETURNS DESCRIPTION
Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]

Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: The output of the GPTBigCodeModel. Returns a tuple or a BaseModelOutputWithPastAndCrossAttentions object depending on the value of return_dict.

RAISES DESCRIPTION
ValueError

If both input_ids and inputs_embeds are specified.

ValueError

If neither input_ids nor inputs_embeds are specified.

ValueError

If batch_size is less than or equal to 0.

AssertionError

If the encoder_attention_mask has an invalid dimension.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
def forward(
    self,
    input_ids: Optional[mindspore.Tensor] = None,
    past_key_values: Optional[List[mindspore.Tensor]] = None,
    attention_mask: Optional[mindspore.Tensor] = None,
    token_type_ids: Optional[mindspore.Tensor] = None,
    position_ids: Optional[mindspore.Tensor] = None,
    head_mask: Optional[mindspore.Tensor] = None,
    inputs_embeds: Optional[mindspore.Tensor] = None,
    encoder_hidden_states: Optional[mindspore.Tensor] = None,
    encoder_attention_mask: Optional[mindspore.Tensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
    """
    Runs the embeddings, every transformer block and the final layer norm.

    Args:
        input_ids: Token ids; reshaped to ``(-1, input_ids.shape[-1])``.
            Mutually exclusive with ``inputs_embeds``.
        past_key_values: One cached entry per block. When given, the past length is read
            from ``past_key_values[0].shape[-2]``.
        attention_mask: Padding mask. It is multiplied into the causal mask; when it is 2D and
            ``position_ids`` is None, position ids are derived from its cumulative sum.
        token_type_ids: Optional ids that are embedded with ``self.wte`` and added to the
            hidden states.
        position_ids: Optional position ids fed to ``self.wpe``.
        head_mask: Optional head mask, expanded per layer by ``self.get_head_mask``.
        inputs_embeds: Pre-computed input embeddings used instead of ``input_ids``.
        encoder_hidden_states: Encoder states forwarded to every block.
        encoder_attention_mask: Cross-attention mask. Only used when
            ``config.add_cross_attention`` is set and ``encoder_hidden_states`` is given;
            a 2D mask is expanded to 3D, otherwise the mask is replaced by None.
        use_cache: Whether to collect each block's ``outputs[1]`` into ``past_key_values``.
            Falls back to ``config.use_cache``.
        output_attentions: Whether to collect attention outputs.
            Falls back to ``config.output_attentions``.
        output_hidden_states: Whether to collect the hidden states before every block and
            after the final norm. Falls back to ``config.output_hidden_states``.
        return_dict: Whether to return a ``BaseModelOutputWithPastAndCrossAttentions``.
            Falls back to ``config.use_return_dict``.

    Returns:
        A ``BaseModelOutputWithPastAndCrossAttentions`` when ``return_dict`` is true, otherwise a
        tuple with the non-None values among (hidden states, presents, all hidden states,
        self-attentions, cross-attentions).

    Raises:
        ValueError: If both input_ids and inputs_embeds are specified.
        ValueError: If neither input_ids nor inputs_embeds are specified.
        ValueError: If batch_size is less than or equal to 0.
        AssertionError: If the encoder_attention_mask is not 3D after the optional expansion.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time")

    if input_ids is not None:
        self.warn_if_padding_and_no_attention_mask(
            input_ids, attention_mask)
        input_shape = input_ids.shape
        input_ids = input_ids.view(-1, input_shape[-1])
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.shape[:-1]
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    if batch_size <= 0:
        raise ValueError("batch_size has to be defined and > 0")

    if token_type_ids is not None:
        token_type_ids = token_type_ids.view(-1, input_shape[-1])

    if past_key_values is None:
        past_length = 0
        past_key_values = tuple([None] * len(self.h))
    else:
        past_length = past_key_values[0].shape[-2]

    if attention_mask is not None and len(attention_mask.shape) == 2 and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids = position_ids.masked_fill(attention_mask == 0, 1)
        if past_length > 0:
            position_ids = position_ids[:, past_length: input_shape[-1] + past_length]
    elif position_ids is None:
        position_ids = ops.arange(
            past_length, input_shape[-1] + past_length, dtype=mindspore.int64)
        position_ids = position_ids.unsqueeze(0)

    # Self-attention mask.
    query_length = input_shape[-1]
    key_length = past_length + query_length
    self_attention_mask = self.bias[None, key_length - query_length: key_length, :key_length]

    if attention_mask is not None:
        self_attention_mask = self_attention_mask * \
            attention_mask.view(batch_size, 1, -1)
        self_attention_mask = self_attention_mask.bool()

    # MQA models: (batch_size, query_length, n_heads, key_length)
    # MHA models: (batch_size, n_heads, query_length, key_length)
    attention_mask = ops.unsqueeze(
        self_attention_mask, 2 if self.multi_query else 1)

    # If a 2D or 3D attention mask is provided for the cross-attention
    # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if (
        self.config.add_cross_attention
        and encoder_hidden_states is not None
        and encoder_attention_mask is not None
    ):
        if encoder_attention_mask.dim() == 2:
            # unsqueeze returns a new tensor; without the assignment a 2D mask
            # stays 2D and always fails the rank check below.
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
        assert encoder_attention_mask.dim() == 3, \
            "encoder_attention_mask must be 2D or 3D"
        encoder_attention_mask = encoder_attention_mask.bool(
        ).unsqueeze(2 if self.multi_query else 1)
    else:
        encoder_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # head_mask has shape n_layer x batch x n_heads x N x N
    head_mask = self.get_head_mask(head_mask, self.config.n_layer)

    if inputs_embeds is None:
        inputs_embeds = self.wte(input_ids)
    position_embeds = self.wpe(position_ids)
    hidden_states = inputs_embeds + position_embeds

    if token_type_ids is not None:
        # Token types share the token embedding table.
        token_type_embeds = self.wte(token_type_ids)
        hidden_states = hidden_states + token_type_embeds

    hidden_states = self.drop(hidden_states)
    output_shape = input_shape + (hidden_states.shape[-1],)

    presents = [] if use_cache else None
    all_self_attentions = () if output_attentions else None
    all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
    all_hidden_states = () if output_hidden_states else None
    for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = block(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask[i],
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )

        hidden_states = outputs[0]
        if use_cache:
            presents.append(outputs[1])

        if output_attentions:
            # The attention entries shift by one position when the cache entry is present.
            all_self_attentions = all_self_attentions + \
                (outputs[2 if use_cache else 1],)
            if self.config.add_cross_attention:
                all_cross_attentions = all_cross_attentions + \
                    (outputs[3 if use_cache else 2],)

    hidden_states = self.ln_f(hidden_states)

    hidden_states = hidden_states.view(output_shape)
    # Add last hidden state
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(
            v
            for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions]
            if v is not None
        )

    return BaseModelOutputWithPastAndCrossAttentions(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
        cross_attentions=all_cross_attentions,
    )

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeModel.get_input_embeddings()

This method returns the input embeddings for the GPTBigCodeModel.

PARAMETER DESCRIPTION
self

The instance of the GPTBigCodeModel class.

RETURNS DESCRIPTION
nn.Embedding

The token embedding layer (wte) of the model.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
685
686
687
688
689
690
691
692
693
694
695
696
697
698
def get_input_embeddings(self):
    """
    Return the token embedding layer of the GPTBigCodeModel.

    Args:
        self: The instance of the GPTBigCodeModel class.

    Returns:
        The object currently stored in ``self.wte``.

    Raises:
        None.
    """
    token_embeddings = self.wte
    return token_embeddings

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodeModel.set_input_embeddings(new_embeddings)

Sets the input embeddings for the GPTBigCodeModel.

PARAMETER DESCRIPTION
self

The instance of the GPTBigCodeModel class.

TYPE: GPTBigCodeModel

new_embeddings

The new input embeddings to be set for the model. It can be of any valid type.

TYPE: object

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
def set_input_embeddings(self, new_embeddings):
    """
    Replace the token embedding layer of the GPTBigCodeModel.

    Args:
        self (GPTBigCodeModel): The instance of the GPTBigCodeModel class.
        new_embeddings (object): The object to store as ``self.wte``.
            No validation is performed here.

    Returns:
        None.

    Raises:
        None.
    """
    setattr(self, "wte", new_embeddings)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodePreTrainedModel

Bases: PreTrainedModel

An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
class GPTBigCodePreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    config_class = GPTBigCodeConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["GPTBigCodeBlock"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, cell):
        """
        Initialize the weights of a single sub-module.

        Args:
            cell: The module to initialize. MLP/attention blocks, linear layers, embeddings and layer norms
                are handled; any other module type is left untouched.

        Returns:
            None.
        """
        if isinstance(cell, (GPTBigCodeMLP, GPTBigCodeAttention)):
            # The output projection of MLP/attention blocks uses a standard deviation scaled down by
            # sqrt(2 * n_layer) instead of the plain initializer_range.
            sigma = self.config.initializer_range / math.sqrt(2 * self.config.n_layer)
            cell.c_proj.weight.set_data(initializer(Normal(sigma=sigma),cell.c_proj.weight.shape, cell.c_proj.weight.dtype))
        if isinstance(cell, nn.Linear):
            cell.weight.set_data(initializer(Normal(sigma=self.config.initializer_range),
                                             cell.weight.shape, cell.weight.dtype))
            # Test for presence explicitly: the truth value of a bias tensor depends on its contents
            # (and is ambiguous for multi-element tensors), which is not what is meant here.
            if cell.bias is not None:
                cell.bias.set_data(initializer(
                    'zeros', cell.bias.shape, cell.bias.dtype))
        elif isinstance(cell, nn.Embedding):
            weight = initializer(Normal(sigma=self.config.initializer_range),
                                          cell.weight.shape,
                                          cell.weight.dtype)
            # Zero the row of the padding token, when the embedding has one.
            if cell.padding_idx is not None:
                weight[cell.padding_idx] = 0
            cell.weight.set_data(weight)
        elif isinstance(cell, nn.LayerNorm):
            cell.weight.set_data(initializer(
                'ones', cell.weight.shape, cell.weight.dtype))
            cell.bias.set_data(initializer(
                'zeros', cell.bias.shape, cell.bias.dtype))

    def _set_gradient_checkpointing(self, module, value=False):
        """
        Set the gradient checkpointing flag on a module.

        Args:
            self (GPTBigCodePreTrainedModel): The instance of the GPTBigCodePreTrainedModel.
            module (object): The module to update. Only instances of GPTBigCodeModel are modified;
                any other module is silently ignored.
            value (bool): The value to assign to ``module.gradient_checkpointing``. Default is False.

        Returns:
            None.

        Note:
            Gradient checkpointing is a memory optimization technique used during training of deep neural networks:
            intermediate activations are not stored during the forward pass and are recomputed during the backward
            pass, trading compute for memory.
        """
        if isinstance(module, GPTBigCodeModel):
            module.gradient_checkpointing = value

    def _backward_compatibility_gradient_checkpointing(self):
        """
        Support the legacy ``gradient_checkpointing`` config attribute.

        If the model supports gradient checkpointing and the config carries a truthy
        ``gradient_checkpointing`` attribute, checkpointing is enabled and the attribute is removed
        from the config.
        """
        if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
            self.gradient_checkpointing_enable()
            # Remove the attribute now that it has been consumed, so it is not saved in the config.
            delattr(self.config, "gradient_checkpointing")

    def gradient_checkpointing_enable(self):
        """
        Activates gradient checkpointing for the current model.
        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".

        Raises:
            ValueError: If the model class does not support gradient checkpointing.
        """
        if not self.supports_gradient_checkpointing:
            raise ValueError(
                f"{self.__class__.__name__} does not support gradient checkpointing.")
        self.apply(partial(self._set_gradient_checkpointing, value=True))

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.GPTBigCodePreTrainedModel.gradient_checkpointing_enable()

Activates gradient checkpointing for the current model. Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint activations".

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
617
618
619
620
621
622
623
624
625
626
def gradient_checkpointing_enable(self):
    """
    Turn on gradient checkpointing for this model.

    Other frameworks may call this feature "activation checkpointing" or "checkpoint activations".

    Raises:
        ValueError: If ``supports_gradient_checkpointing`` is falsy for this model.
    """
    if self.supports_gradient_checkpointing:
        # Visit the modules through ``apply`` and switch the flag on where supported.
        enable_checkpointing = partial(self._set_gradient_checkpointing, value=True)
        self.apply(enable_checkpointing)
        return
    model_name = self.__class__.__name__
    raise ValueError(
        f"{model_name} does not support gradient checkpointing.")

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.masked_softmax(input_x, mask, mask_value)

Fuse kernel for masked softmax.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
63
64
65
66
67
def masked_softmax(input_x: mindspore.Tensor, mask: mindspore.Tensor, mask_value: mindspore.Tensor):
    """
    Apply a mask and then a softmax over the last axis.

    Elements of ``input_x`` are kept where ``mask`` is true and taken from ``mask_value``
    elsewhere before the softmax is computed.
    """
    masked_scores = ops.where(mask, input_x, mask_value)
    return ops.softmax(masked_scores, axis=-1)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.upcast_masked_softmax(input_x, mask, mask_value, scale, softmax_dtype)

Fuse kernel for upcast masked softmax.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
44
45
46
47
48
49
50
51
52
def upcast_masked_softmax(
    input_x: mindspore.Tensor, mask: mindspore.Tensor, mask_value: mindspore.Tensor, scale: float, softmax_dtype: mindspore.dtype
):
    """
    Scaled, masked softmax computed in ``softmax_dtype``.

    The input is cast to ``softmax_dtype`` and multiplied by ``scale``; elements are kept where
    ``mask`` is true and taken from ``mask_value`` elsewhere; the softmax runs over the last axis
    and the result is cast back to the dtype of the original input.
    """
    original_dtype = input_x.dtype
    scaled_scores = input_x.to(softmax_dtype) * scale
    masked_scores = ops.where(mask, scaled_scores, mask_value)
    probabilities = ops.softmax(masked_scores, axis=-1)
    return probabilities.to(original_dtype)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode.upcast_softmax(input_x, scale, softmax_dtype)

Fuse kernel for upcast softmax.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode.py
55
56
57
58
59
60
def upcast_softmax(input_x: mindspore.Tensor, scale: float, softmax_dtype: mindspore.dtype):
    """
    Scaled softmax computed in ``softmax_dtype``.

    The input is cast to ``softmax_dtype`` and multiplied by ``scale``, the softmax runs over
    the last axis, and the result is cast back to the dtype of the original input.
    """
    original_dtype = input_x.dtype
    scaled_scores = input_x.to(softmax_dtype) * scale
    probabilities = ops.softmax(scaled_scores, axis=-1)
    return probabilities.to(original_dtype)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode_config

MindNLP gpt_bigcode config

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode_config.GPTBigCodeConfig

Bases: PretrainedConfig

GPT BigCode config

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode_config.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
class GPTBigCodeConfig(PretrainedConfig):
    r"""
    Configuration holder for GPT BigCode models.
    """
    model_type = "gpt_bigcode"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Generic attribute names mapped to the GPT-style names stored on this config.
    attribute_map = {
        "hidden_size": "n_embd",
        "max_position_embeddings": "n_positions",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=768,
        n_layer=12,
        n_head=12,
        n_inner=None,
        activation_function="gelu_approximate",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        attention_softmax_in_fp32=True,
        scale_attention_softmax_in_fp32=True,
        multi_query=True,
        **kwargs,
    ):
        """
        Create a GPTBigCodeConfig and store every argument as an attribute of the same name.

        Args:
            vocab_size (int, optional): The size of the vocabulary. Default is 50257.
            n_positions (int, optional): The maximum sequence length for the model. Default is 1024.
            n_embd (int, optional): The dimension of the embeddings and hidden states. Default is 768.
            n_layer (int, optional): The number of layers in the model. Default is 12.
            n_head (int, optional): The number of attention heads in the model. Default is 12.
            n_inner (int, optional): The inner dimension of the feedforward layers. Default is None.
            activation_function (str, optional): The activation function used in the model.
                Default is 'gelu_approximate'.
            resid_pdrop (float, optional): The dropout probability for residual connections. Default is 0.1.
            embd_pdrop (float, optional): The dropout probability for embeddings. Default is 0.1.
            attn_pdrop (float, optional): The dropout probability for attention layers. Default is 0.1.
            layer_norm_epsilon (float, optional): The epsilon value for layer normalization. Default is 1e-05.
            initializer_range (float, optional): The range for parameter initializers. Default is 0.02.
            scale_attn_weights (bool, optional): Whether to scale the attention weights. Default is True.
            use_cache (bool, optional): Whether to use caching during inference. Default is True.
            bos_token_id (int, optional): The token id for the beginning of sequence. Default is 50256.
            eos_token_id (int, optional): The token id for the end of sequence. Default is 50256.
            attention_softmax_in_fp32 (bool, optional): Whether to use fp32 for attention softmax.
                Default is True.
            scale_attention_softmax_in_fp32 (bool, optional): Whether to scale attention softmax in fp32.
                Default is True.
            multi_query (bool, optional): Whether to use multi-query attention. Default is True.
            **kwargs: Extra keyword arguments forwarded to the parent class initializer.
        """
        # Vocabulary and model dimensions.
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner

        # Activation, normalization and initialization settings.
        self.activation_function = activation_function
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        # Dropout probabilities.
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop

        # Attention behaviour.
        self.scale_attn_weights = scale_attn_weights
        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
        self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
        self.multi_query = multi_query

        # Caching and special token ids.
        self.use_cache = use_cache
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode_config.GPTBigCodeConfig.__init__(vocab_size=50257, n_positions=1024, n_embd=768, n_layer=12, n_head=12, n_inner=None, activation_function='gelu_approximate', resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, layer_norm_epsilon=1e-05, initializer_range=0.02, scale_attn_weights=True, use_cache=True, bos_token_id=50256, eos_token_id=50256, attention_softmax_in_fp32=True, scale_attention_softmax_in_fp32=True, multi_query=True, **kwargs)

`__init__`

Initialize a new GPTBigCodeConfig object.

PARAMETER DESCRIPTION
vocab_size

The size of the vocabulary. Default is 50257.

TYPE: int DEFAULT: 50257

n_positions

The maximum sequence length for the model. Default is 1024.

TYPE: int DEFAULT: 1024

n_embd

The dimension of the embeddings and hidden states. Default is 768.

TYPE: int DEFAULT: 768

n_layer

The number of layers in the model. Default is 12.

TYPE: int DEFAULT: 12

n_head

The number of attention heads in the model. Default is 12.

TYPE: int DEFAULT: 12

n_inner

The inner dimension of the feedforward layers. Default is None.

TYPE: int DEFAULT: None

activation_function

The activation function used in the model. Default is 'gelu_approximate'.

TYPE: str DEFAULT: 'gelu_approximate'

resid_pdrop

The dropout probability for residual connections. Default is 0.1.

TYPE: float DEFAULT: 0.1

embd_pdrop

The dropout probability for embeddings. Default is 0.1.

TYPE: float DEFAULT: 0.1

attn_pdrop

The dropout probability for attention layers. Default is 0.1.

TYPE: float DEFAULT: 0.1

layer_norm_epsilon

The epsilon value for layer normalization. Default is 1e-05.

TYPE: float DEFAULT: 1e-05

initializer_range

The range for parameter initializers. Default is 0.02.

TYPE: float DEFAULT: 0.02

scale_attn_weights

Whether to scale the attention weights. Default is True.

TYPE: bool DEFAULT: True

use_cache

Whether to use caching during inference. Default is True.

TYPE: bool DEFAULT: True

bos_token_id

The token id for the beginning of sequence. Default is 50256.

TYPE: int DEFAULT: 50256

eos_token_id

The token id for the end of sequence. Default is 50256.

TYPE: int DEFAULT: 50256

attention_softmax_in_fp32

Whether to use fp32 for attention softmax. Default is True.

TYPE: bool DEFAULT: True

scale_attention_softmax_in_fp32

Whether to scale attention softmax in fp32. Default is True.

TYPE: bool DEFAULT: True

multi_query

Whether to use multi-query attention. Default is True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION

None.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode_config.py
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def __init__(
    self,
    vocab_size=50257,
    n_positions=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    n_inner=None,
    activation_function="gelu_approximate",
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    scale_attn_weights=True,
    use_cache=True,
    bos_token_id=50256,
    eos_token_id=50256,
    attention_softmax_in_fp32=True,
    scale_attention_softmax_in_fp32=True,
    multi_query=True,
    **kwargs,
):
    """
    Create a GPTBigCodeConfig and store every argument as an attribute of the same name.

    Args:
        vocab_size (int, optional): The size of the vocabulary. Default is 50257.
        n_positions (int, optional): The maximum sequence length for the model. Default is 1024.
        n_embd (int, optional): The dimension of the embeddings and hidden states. Default is 768.
        n_layer (int, optional): The number of layers in the model. Default is 12.
        n_head (int, optional): The number of attention heads in the model. Default is 12.
        n_inner (int, optional): The inner dimension of the feedforward layers. Default is None.
        activation_function (str, optional): The activation function used in the model.
            Default is 'gelu_approximate'.
        resid_pdrop (float, optional): The dropout probability for residual connections. Default is 0.1.
        embd_pdrop (float, optional): The dropout probability for embeddings. Default is 0.1.
        attn_pdrop (float, optional): The dropout probability for attention layers. Default is 0.1.
        layer_norm_epsilon (float, optional): The epsilon value for layer normalization. Default is 1e-05.
        initializer_range (float, optional): The range for parameter initializers. Default is 0.02.
        scale_attn_weights (bool, optional): Whether to scale the attention weights. Default is True.
        use_cache (bool, optional): Whether to use caching during inference. Default is True.
        bos_token_id (int, optional): The token id for the beginning of sequence. Default is 50256.
        eos_token_id (int, optional): The token id for the end of sequence. Default is 50256.
        attention_softmax_in_fp32 (bool, optional): Whether to use fp32 for attention softmax.
            Default is True.
        scale_attention_softmax_in_fp32 (bool, optional): Whether to scale attention softmax in fp32.
            Default is True.
        multi_query (bool, optional): Whether to use multi-query attention. Default is True.
        **kwargs: Extra keyword arguments forwarded to the parent class initializer.
    """
    # Vocabulary and model dimensions.
    self.vocab_size = vocab_size
    self.n_positions = n_positions
    self.n_embd = n_embd
    self.n_layer = n_layer
    self.n_head = n_head
    self.n_inner = n_inner

    # Activation, normalization and initialization settings.
    self.activation_function = activation_function
    self.layer_norm_epsilon = layer_norm_epsilon
    self.initializer_range = initializer_range

    # Dropout probabilities.
    self.resid_pdrop = resid_pdrop
    self.embd_pdrop = embd_pdrop
    self.attn_pdrop = attn_pdrop

    # Attention behaviour.
    self.scale_attn_weights = scale_attn_weights
    self.attention_softmax_in_fp32 = attention_softmax_in_fp32
    self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
    self.multi_query = multi_query

    # Caching and special token ids.
    self.use_cache = use_cache
    self.bos_token_id = bos_token_id
    self.eos_token_id = eos_token_id

    super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode_tokenizer

GPT2Tokenizer

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode_tokenizer.GPTBigCodeTokenizer

Bases: PreTrainedTokenizer

Tokenizer used for GPT2 text process.

PARAMETER DESCRIPTION
vocab

Vocabulary used to look up words.

TYPE: Vocab

return_token

Whether to return tokens. If True, return the tokens; if False, return the token ids (with the attention mask). Default: False.

TYPE: bool

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode_tokenizer.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
class GPTBigCodeTokenizer(PreTrainedTokenizer):
    """
    Tokenizer for GPT BigCode text processing, backed by a tokenizer loaded from a tokenizer file.

    Args:
        tokenizer_file (str): Path of the tokenizer file, loaded with ``Tokenizer.from_file``.
        return_token (bool): Read from ``**kwargs``. If True, encoding returns the tokens;
            otherwise it returns the ids together with the attention mask. Default: False.
    """
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs
    ):
        """
        Initializes a new instance of the GPTBigCodeTokenizer class.

        Args:
            self (GPTBigCodeTokenizer): The instance of the class itself.
            tokenizer_file (str): The file path of the tokenizer file to be used. Only string values are supported.
            unk_token (str): The token to represent unknown words. Default is '<|endoftext|>'.
            bos_token (str): The token to represent the beginning of a sentence. Default is '<|endoftext|>'.
            eos_token (str): The token to represent the end of a sentence. Default is '<|endoftext|>'.
            add_prefix_space (bool): Whether to add a prefix space before the input text. Default is False.
            **kwargs: Additional keyword arguments. ``return_token`` (bool, default False) is read from here.

        Returns:
            None.

        Raises:
            ValueError: If the tokenizer_file is not of type string.

        """
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs)

        # NOTE: kwargs were already forwarded to the base class above, so it receives
        # ``return_token`` as well; the pop only affects this local dict.
        return_token = kwargs.pop('return_token', False)

        if isinstance(tokenizer_file, str):
            self._tokenizer = Tokenizer.from_file(tokenizer_file)
        else:
            raise ValueError(f'only support string, but got {tokenizer_file}')

        self.return_token = return_token
        self.implementation = Implementation.PY

    def execute_py(self, text_input):
        """
        Encode ``text_input``; public wrapper around `_execute_py`.
        """
        return self._execute_py(text_input)

    def _execute_py(self, text_input):
        """
        Encode ``text_input`` with the underlying tokenizer.

        Returns:
            A numpy array of the tokens when ``self.return_token`` is True; otherwise a dict with
            the keys ``"input_ids"`` and ``"attention_mask"`` holding tensors.
        """
        text_input = self._convert_to_unicode(text_input)
        tokens = self._tokenizer.encode(text_input)
        if self.return_token is True:
            return np.array(tokens.tokens)
        return {"input_ids": Tensor(np.array(tokens.ids)), "attention_mask": Tensor(np.array(tokens.attention_mask))}

    def _convert_to_unicode(self, text_input):
        """
        Converts `text` to Unicode (if it's not already), assuming utf-8 input.

        Args:
            text_input: A ``str``, ``bytes`` or numpy array holding the text.

        Returns:
            str: The text as a Python string.

        Raises:
            ValueError: If ``text_input`` is none of the supported types.
        """
        if isinstance(text_input, str):
            return text_input
        if isinstance(text_input, bytes):
            return text_input.decode("utf-8", "ignore")
        if isinstance(text_input, np.ndarray):
            if text_input.dtype.type is np.bytes_:
                text_input = np.char.decode(text_input, "utf-8")
            return str(text_input)
        # Unsupported inputs (e.g. int) usually have no ``dtype``; look it up defensively so the
        # intended ValueError is raised instead of an AttributeError.
        raise ValueError(
            f"Unsupported string type: {type(text_input)}, {getattr(text_input, 'dtype', None)}")

    def _convert_token_to_id(self, token):
        """
        Converts the given token to its corresponding ID using the GPTBigCodeTokenizer.

        Args:
            self (GPTBigCodeTokenizer): An instance of the GPTBigCodeTokenizer class.
            token (str): The token to be converted to ID.

        Returns:
            int: The ID corresponding to the given token. Returns self.unk_token_id if the token is not found.

        Raises:
            None.
        """
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode_tokenizer.GPTBigCodeTokenizer.__init__(tokenizer_file=None, unk_token='<|endoftext|>', bos_token='<|endoftext|>', eos_token='<|endoftext|>', add_prefix_space=False, **kwargs)

Initializes a new instance of the GPTBigCodeTokenizer class.

PARAMETER DESCRIPTION
self

The instance of the class itself.

TYPE: GPTBigCodeTokenizer

tokenizer_file

The file path of the tokenizer file to be used. Only string values are supported.

TYPE: str DEFAULT: None

unk_token

The token to represent unknown words. Default is '<|endoftext|>'.

TYPE: str DEFAULT: '<|endoftext|>'

bos_token

The token to represent the beginning of a sentence. Default is '<|endoftext|>'.

TYPE: str DEFAULT: '<|endoftext|>'

eos_token

The token to represent the end of a sentence. Default is '<|endoftext|>'.

TYPE: str DEFAULT: '<|endoftext|>'

add_prefix_space

Whether to add a prefix space before the input text. Default is False.

TYPE: bool DEFAULT: False

**kwargs

Additional keyword arguments.

DEFAULT: {}

RETURNS DESCRIPTION

None.

RAISES DESCRIPTION
ValueError

If the tokenizer_file is not of type string.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode_tokenizer.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def __init__(
    self,
    tokenizer_file=None,
    unk_token="<|endoftext|>",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    add_prefix_space=False,
    **kwargs
):
    """
    Build a GPTBigCodeTokenizer from a tokenizer file.

    Args:
        self (GPTBigCodeTokenizer): The instance being initialized.
        tokenizer_file (str): Path of the tokenizer file to load. Only strings are accepted.
        unk_token (str): Token used for unknown words. Default is '<|endoftext|>'.
        bos_token (str): Token marking the beginning of a sentence. Default is '<|endoftext|>'.
        eos_token (str): Token marking the end of a sentence. Default is '<|endoftext|>'.
        add_prefix_space (bool): Whether to add a prefix space before the input text. Default is False.
        **kwargs: Additional keyword arguments, forwarded to the parent initializer.
            ``return_token`` (bool, default False) is also read from here.

    Returns:
        None.

    Raises:
        ValueError: If tokenizer_file is not a string.
    """
    super().__init__(
        unk_token=unk_token, bos_token=bos_token, eos_token=eos_token,
        add_prefix_space=add_prefix_space, **kwargs)

    # Reject anything that is not a path string before trying to load the tokenizer.
    if not isinstance(tokenizer_file, str):
        raise ValueError(f'only support string, but got {tokenizer_file}')
    self._tokenizer = Tokenizer.from_file(tokenizer_file)

    # The parent initializer has already received kwargs, including any ``return_token``.
    self.return_token = kwargs.pop('return_token', False)
    self.implementation = Implementation.PY

mindnlp.transformers.models.gpt_bigcode.gpt_bigcode_tokenizer.GPTBigCodeTokenizer.execute_py(text_input)

Execute method.

Source code in mindnlp/transformers/models/gpt_bigcode/gpt_bigcode_tokenizer.py
86
87
88
89
90
def execute_py(self, text_input):
    """
    Encode ``text_input`` by delegating to ``self._execute_py`` and return its result.
    """
    encoded = self._execute_py(text_input)
    return encoded