# Source code for aisquared.utils.utils

from typing import Union
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
try:
    import tensorflow as tf
except ImportError:
    pass
import numpy as np
import beyondml


def _rmse(y_true, y_pred):
    """
    Root mean squared error, averaged uniformly over the outputs

    Parameters
    ----------
    y_true : array or array-like
        The reference values
    y_pred : array or array-like
        The values to score against the reference

    Returns
    -------
    rmse : float
        The root mean squared error
    """
    # The ``squared=False`` shortcut was deprecated in scikit-learn 1.4 and
    # is rejected by later releases, so take the root of the per-output MSE
    # explicitly instead.
    per_output_mse = mean_squared_error(
        y_true, y_pred, multioutput='raw_values')
    return np.mean(np.sqrt(per_output_mse))


def _residual_std(reference, predictions):
    """
    Standard deviation of the element-wise residuals

    Both inputs are reshaped to (n_samples, -1) before subtracting so that
    a 1-D array and a single-column 2-D array line up row by row instead of
    broadcasting into an (n_samples, n_samples) matrix.

    Parameters
    ----------
    reference : array or array-like
        The values to compare against
    predictions : array or array-like
        The predicted values

    Returns
    -------
    std : float
        The standard deviation of ``predictions - reference``
    """
    reference = np.asarray(reference)
    predictions = np.asarray(predictions)
    reference = reference.reshape(len(reference), -1)
    predictions = predictions.reshape(len(predictions), -1)
    return np.std(predictions - reference)


def _mimic_class_labels(mimic_preds):
    """
    Turn raw mimic-network outputs into hard class labels

    A single score per sample is thresholded at 0.5; several scores per
    sample are reduced with an argmax over the last axis.

    Parameters
    ----------
    mimic_preds : array or array-like
        The outputs of the mimic network

    Returns
    -------
    labels : 1-D integer array
        The predicted label for each sample
    """
    scores = np.asarray(mimic_preds)
    if scores.ndim == 1 or scores.shape[1] == 1:
        return (scores >= 0.5).astype(int).ravel()
    # NOTE(review): argmax yields column indices; this assumes the class
    # labels are 0..k-1 in column order - confirm against callers.
    return scores.argmax(axis=1)


def _print_report(true_data, orig_preds, mimic_preds, problem_type):
    """
    Print how the original model and the mimic network perform

    Three comparisons are printed: the original model against the true
    data, the mimic network against the original model's predictions
    ('Relative to First Model') and the mimic network against the true
    data ('Relative to Original').

    Parameters
    ----------
    true_data : array or array-like
        The ground-truth targets
    orig_preds : array or array-like
        The predictions of the model being mimicked
    mimic_preds : array or array-like
        The raw outputs of the mimic network
    problem_type : str
        'classification' prints a confusion matrix and classification
        report; any other value prints the RMSE and the residual standard
        deviation
    """
    is_classification = problem_type == 'classification'

    print('ORIGINAL PERFORMANCE:')
    if is_classification:
        print(confusion_matrix(true_data, orig_preds))
        print(classification_report(true_data, orig_preds))
    else:
        print(_rmse(true_data, orig_preds))
    print('\n\n')

    print('MIMIC PERFORMANCE:')
    print('\n')

    if is_classification:
        mimic_labels = _mimic_class_labels(mimic_preds)

    comparisons = (
        ('Relative to First Model:', orig_preds),
        ('Relative to Original:', true_data),
    )
    for position, (heading, reference) in enumerate(comparisons):
        if position:
            # Blank separator between the two comparisons only
            print('\n')
        print(heading)
        if is_classification:
            print(confusion_matrix(reference, mimic_labels))
            print(classification_report(reference, mimic_labels))
        else:
            print(_rmse(reference, mimic_preds))
            # Always measure the mimic's residuals against the reference of
            # the current section (previously the second section reported
            # the original model's residuals by mistake).
            print(
                f'Standard deviation: {_residual_std(reference, mimic_preds)}')


def _get_cv_model(size, input_shape, num_outputs, output_activation):
    """
    Build a convolutional Keras model of the requested size

    The network is a stack of blocks (two 3x3 'same'-padded ReLU
    convolutions followed by a max-pooling layer with stride 1) whose filter
    count starts at 8 and doubles with every block, then a flatten layer, a
    number of ReLU dense layers and the output dense layer.

    Parameters
    ----------
    size : str
        One of 'small', 'medium', or 'large'
    input_shape : int or tuple of int
        The input shape to the model
    num_outputs : int
        The number of units in the output layer
    output_activation : str or keras activation function
        The activation of the output layer

    Returns
    -------
    model : TensorFlow Keras model
        The (untrained) model

    Raises
    ------
    ValueError
        If size is not one of the supported values
    """
    # size -> (number of conv blocks, number of dense layers, dense width)
    presets = {
        'small': (2, 1, 64),
        'medium': (4, 2, 128),
        'large': (6, 4, 256),
    }
    chosen = presets.get(size) if isinstance(size, str) else None
    if chosen is None:
        raise ValueError(
            f'size must be one of "small", "medium", "large", got {size}')
    block_count, dense_count, dense_width = chosen

    layers = tf.keras.layers
    inputs = layers.Input(input_shape)

    features = inputs
    for block_index in range(block_count):
        filters = 8 * (2 ** block_index)
        for _ in range(2):
            features = layers.Conv2D(
                filters, 3, padding='same', activation='relu')(features)
        features = layers.MaxPool2D(strides=1)(features)

    features = layers.Flatten()(features)
    for _ in range(dense_count):
        features = layers.Dense(dense_width, activation='relu')(features)

    outputs = layers.Dense(num_outputs, activation=output_activation)(features)
    return tf.keras.models.Model(inputs, outputs)


def _get_embedding_model(size, vocab_size, input_shape, num_outputs, output_activation):
    """
    Build an embedding-based Keras model of the requested size

    The network is an embedding layer, a flatten layer, a number of ReLU
    dense layers and the output dense layer.

    Parameters
    ----------
    size : str
        One of 'small', 'medium', or 'large'
    vocab_size : int
        The vocabulary size passed to the embedding layer; must not be None
    input_shape : int or tuple of int
        The input shape to the model
    num_outputs : int
        The number of units in the output layer
    output_activation : str or keras activation function
        The activation of the output layer

    Returns
    -------
    model : TensorFlow Keras model
        The (untrained) model

    Raises
    ------
    ValueError
        If vocab_size is None or size is not one of the supported values
    """
    if vocab_size is None:
        raise ValueError(
            'If NLP embedding model specified, must also specify vocab_size')

    # size -> (embedding dimension, number of dense layers, dense width)
    presets = {
        'small': (4, 4, 64),
        'medium': (8, 6, 128),
        'large': (16, 8, 256),
    }
    chosen = presets.get(size) if isinstance(size, str) else None
    if chosen is None:
        raise ValueError(
            f'size must be one of "small", "medium", "large", got {size}')
    embed_width, dense_count, dense_width = chosen

    layers = tf.keras.layers
    inputs = layers.Input(input_shape)
    features = layers.Embedding(vocab_size, embed_width)(inputs)
    features = layers.Flatten()(features)
    for _ in range(dense_count):
        features = layers.Dense(dense_width, activation='relu')(features)

    outputs = layers.Dense(num_outputs, activation=output_activation)(features)
    return tf.keras.models.Model(inputs, outputs)


def _get_fc_model(size, input_shape, num_outputs, output_activation):
    """
    Build a fully-connected Keras model of the requested size

    The network is a stack of ReLU dense layers followed by the output
    dense layer.

    Parameters
    ----------
    size : str
        One of 'small', 'medium', or 'large'
    input_shape : int or tuple of int
        The input shape to the model
    num_outputs : int
        The number of units in the output layer
    output_activation : str or keras activation function
        The activation of the output layer

    Returns
    -------
    model : TensorFlow Keras model
        The (untrained) model

    Raises
    ------
    ValueError
        If size is not one of the supported values
    """
    # size -> (number of dense layers, dense width)
    presets = {
        'small': (4, 64),
        'medium': (6, 128),
        'large': (8, 256),
    }
    chosen = presets.get(size) if isinstance(size, str) else None
    if chosen is None:
        raise ValueError(
            f'size must be one of "small", "medium", "large", got {size}')
    dense_count, dense_width = chosen

    layers = tf.keras.layers
    inputs = layers.Input(input_shape)

    # Seed the running tensor with the input so every hidden layer is
    # wired the same way.
    features = inputs
    for _ in range(dense_count):
        features = layers.Dense(dense_width, activation='relu')(features)

    outputs = layers.Dense(num_outputs, activation=output_activation)(features)
    return tf.keras.models.Model(inputs, outputs)


def mimic_model(
    trained_model: BaseEstimator,
    nnet: 'tf.keras.models.Model',
    training_data: np.ndarray,
    test_data: np.ndarray,
    test_labels: np.ndarray,
    problem_type: str,
    loss: str,
    metrics: Union[str, list],
    optimizer: str,
    mimic_proba: bool = False,
    retention: float = 0.9,
    batch_size: int = 32,
    epochs: int = 100,
    starting_sparsification: int = 0,
    max_sparsification: int = 99,
    sparsification_rate: int = 5
) -> 'tf.keras.models.Model':
    """
    Train a sparse neural network to mimic a scikit-learn model

    The trained model's outputs on training_data become the training
    targets of the network. After training, a performance report comparing
    the original model, the network and test_labels is printed.

    Parameters
    ----------
    trained_model : sklearn model
        The model that is already trained
    nnet : TensorFlow keras Model
        The neural network to train to mimic the trained model
    training_data : array or array-like
        The input data that was used to train the trained model
    test_data : array or array-like
        The input data to be used for testing
    test_labels : array or array-like
        The output data used in testing
    problem_type : str
        The type of problem, either 'classification' or 'regression'
    loss : str or keras loss function
        The loss to use
    metrics : str, function or list of str, function
        Metrics to measure
    optimizer : str or keras optimizer
        The optimizer to use
    mimic_proba : bool (default False)
        For classification, mimic the probability outputs
    retention : float (default 0.9)
        The retention of performance to allow further pruning
    batch_size : int (default 32)
        The batch size to use while training
    epochs : int (default 100)
        The number of epochs (if early stopping is not met beforehand)
    starting_sparsification : int (default 0)
        The starting model sparsification
    max_sparsification : int (default 99)
        The maximum sparsification to allow
    sparsification_rate : int (default 5)
        The sparsification rate when invoked

    Returns
    -------
    nnet : TensorFlow keras Model
        The trained model

    Raises
    ------
    ValueError
        If problem_type is neither 'classification' nor 'regression'
    TypeError
        If trained_model is not a scikit-learn estimator, or is not a
        transformer, classifier, or regressor
    """
    # The TensorFlow annotations above are quoted on purpose: the module
    # guards its TensorFlow import, and unquoted annotations would be
    # evaluated when the function is defined and fail without TensorFlow.

    # Check problem type
    if problem_type not in ['classification', 'regression']:
        raise ValueError(
            'problem_type must be "classification" or "regression"')

    # Check that the model to mimic is a sklearn model
    if not isinstance(trained_model, BaseEstimator):
        raise TypeError(
            'Currently can only convert trained scikit-learn models')

    # Get the predictions
    # NOTE(review): the transformer check runs first, so an estimator that
    # is both a transformer and a classifier/regressor is mimicked on its
    # transform() output - confirm this is intended.
    if isinstance(trained_model, TransformerMixin):
        training_predictions = trained_model.transform(training_data)
        testing_predictions = trained_model.transform(test_data)
    elif isinstance(trained_model, (ClassifierMixin, RegressorMixin)):
        if isinstance(trained_model, ClassifierMixin) and mimic_proba:
            # Train on probabilities, but keep hard labels for the test
            # predictions because the report compares them as labels.
            training_predictions = trained_model.predict_proba(training_data)
        else:
            training_predictions = trained_model.predict(training_data)
        testing_predictions = trained_model.predict(test_data)
    else:
        raise TypeError(
            'trained_model is not a scikit-learn TransformerMixin, ClassifierMixin, or RegressorMixin')

    if problem_type == 'classification':
        cutoff = retention
    else:
        # Cutoff is the original model's test RMSE scaled by the retention.
        # The ``squared=False`` shortcut was deprecated in scikit-learn 1.4
        # and is rejected by later releases, so the root of the per-output
        # MSE is taken explicitly and averaged over the outputs.
        per_output_mse = mean_squared_error(
            test_labels, testing_predictions, multioutput='raw_values')
        cutoff = np.mean(np.sqrt(per_output_mse)) / retention

    # Add layer masks
    nnet = beyondml.tflow.utils.add_layer_masks(nnet)
    nnet = beyondml.tflow.utils.train_model(
        nnet,
        training_data,
        training_predictions,
        loss,
        metrics,
        optimizer,
        cutoff,
        batch_size,
        epochs,
        starting_sparsification,
        max_sparsification,
        sparsification_rate
    )

    # Strip the masks again before evaluating and returning the network
    nnet = beyondml.tflow.utils.remove_layer_masks(nnet)
    nnet_preds = nnet.predict(test_data)
    _print_report(test_labels, testing_predictions, nnet_preds, problem_type)
    return nnet


def get_model(
    model_type: str,
    input_shape: Union[int, tuple],
    num_outputs: int,
    output_activation: str,
    size: str = 'small',
    vocab_size: Union[None, int] = None
) -> 'tf.keras.models.Model':
    """
    Get a pre-configured model for different use cases

    Parameters
    ----------
    model_type : str
        Either 'cv', 'nlp_embedding', or 'fc', defining the model type
    input_shape : int or tuple of int
        The input shape to the model
    num_outputs : int
        The output shape of the model
    output_activation : str or keras activation function
        The activation of the final layer of the model
    size : str (default 'small')
        One of either 'small', 'medium', or 'large'
    vocab_size : int or None (default None)
        Size of the vocab, required if model_type is 'nlp_embedding'

    Returns
    -------
    model : TensorFlow Keras model
        The model

    Raises
    ------
    ValueError
        If model_type is not one of the supported values; the builders
        also raise ValueError for an unsupported size or, for
        'nlp_embedding', a missing vocab_size
    """
    if model_type == 'cv':
        return _get_cv_model(size, input_shape, num_outputs, output_activation)
    if model_type == 'nlp_embedding':
        return _get_embedding_model(
            size, vocab_size, input_shape, num_outputs, output_activation)
    if model_type == 'fc':
        return _get_fc_model(size, input_shape, num_outputs, output_activation)

    raise ValueError(
        f'model_type must be one of "cv", "nlp_embedding", or "fc", got {model_type}')