Source code for aisquared.config.preprocessing.tabular.Steps

from typing import Union
from aisquared.base import BaseObject


class ZScore(BaseObject):
    """
    Z-Score normalization preprocessing step

    Z-Score normalization takes each supplied column value, subtracts that
    column's provided mean, and divides by the provided standard deviation.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.tabular.TabularPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.tabular.ZScore(
    ...         [0, 1, 2],
    ...         [0.2, 0.4, 0.6]
    ...     )
    ... )
    """

    def __init__(
            self,
            means: list,
            stds: list,
            columns: Union[list, None] = None
    ):
        """
        Parameters
        ----------
        means : list
            List of integer or float values which are the means of the
            associated columns
        stds : list
            List of integer or float values which are the standard
            deviations of the associated columns
        columns : None or list (default None)
            If provided, is a list of column indexes to apply
            normalization to

        Raises
        ------
        TypeError
            If means or stds is not a list of int/float values, or if
            columns is neither None nor a list of int values
        ValueError
            If means and stds differ in length, or if columns is provided
            and its length differs from that of means
        """
        super().__init__()

        # Each assignment goes through the property setter below, so type
        # validation happens before the cross-field length checks.
        self.means = means
        self.stds = stds
        self.columns = columns

        # Length consistency is only checked here, at construction time;
        # the setters validate types but not lengths.
        if len(self.means) != len(self.stds):
            raise ValueError('means and stds must have the same length')
        if self.columns is not None:
            if len(self.columns) != len(self.means):
                raise ValueError(
                    'Number of columns must match number of means and stds')

    @property
    def means(self):
        """The list of per-column means"""
        return self._means

    @means.setter
    def means(self, value):
        if not isinstance(value, list):
            raise TypeError('means must be a list')
        # bool is a subclass of int, so True/False also pass this check.
        if not all(isinstance(val, (int, float)) for val in value):
            raise TypeError('Each value in means must be int or float')
        self._means = value

    @property
    def stds(self):
        """The list of per-column standard deviations"""
        return self._stds

    @stds.setter
    def stds(self, value):
        if not isinstance(value, list):
            raise TypeError('stds must be a list')
        # NOTE(review): zero values are accepted here even though this step
        # is described as dividing by the standard deviation - confirm
        # whether a zero should be rejected.
        if not all(isinstance(val, (int, float)) for val in value):
            raise TypeError('Each value in stds must be int or float')
        self._stds = value

    @property
    def columns(self):
        """The list of column indexes, or None if none were supplied"""
        return self._columns

    @columns.setter
    def columns(self, value):
        # Only None or a list is accepted; a bare int is rejected.
        if not isinstance(value, list) and value is not None:
            raise TypeError('If provided, columns must be list')
        if isinstance(value, list) and not all(
                isinstance(val, int) for val in value):
            raise TypeError('Each value of columns must be an int')
        self._columns = value

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            Dictionary with a 'className' of 'ZScore' and a 'params'
            mapping holding the current means, stds and columns
        """
        return {
            'className': 'ZScore',
            'params': {
                'means': self.means,
                'stds': self.stds,
                'columns': self.columns
            }
        }
class MinMax(BaseObject):
    """
    Min-Max Scaling preprocessing step

    Min-Max Scaling takes all associated columns and maps values relative
    to the minimum and maximum values of the training data.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.tabular.TabularPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.tabular.MinMax(
    ...         [0, 1.1, 2],
    ...         [0.2, 14, 18.3]
    ...     )
    ... )
    """

    def __init__(
            self,
            mins: list,
            maxs: list,
            columns: Union[list, None] = None
    ):
        """
        Parameters
        ----------
        mins : list
            List of integers or floats associated with the minimum values
            of each column in the training data
        maxs : list
            List of integers or floats associated with the maximum values
            of each column in the training data
        columns : None or list (default None)
            If provided, a list of column indexes to apply scaling to

        Raises
        ------
        TypeError
            If mins or maxs is not a list of int/float values, or if
            columns is neither None nor a list of int values
        ValueError
            If mins and maxs differ in length, or if columns is provided
            and its length differs from that of mins
        """
        super().__init__()

        # Each assignment goes through the property setter below, so type
        # validation happens before the cross-field length checks.
        self.mins = mins
        self.maxs = maxs
        self.columns = columns

        # Length consistency is only checked here, at construction time;
        # the setters validate types but not lengths.
        if len(self.mins) != len(self.maxs):
            raise ValueError('Length of mins and maxs must equal')
        if self.columns is not None:
            if len(self.mins) != len(self.columns):
                raise ValueError(
                    'Number of mins and maxs must equal the number of columns')

    @property
    def mins(self):
        """The list of per-column minimum values"""
        return self._mins

    @mins.setter
    def mins(self, value):
        if not isinstance(value, list):
            raise TypeError('mins must be a list')
        # bool is a subclass of int, so True/False also pass this check.
        if not all(isinstance(val, (int, float)) for val in value):
            raise TypeError('Each value in mins must be int or float')
        self._mins = value

    @property
    def maxs(self):
        """The list of per-column maximum values"""
        return self._maxs

    @maxs.setter
    def maxs(self, value):
        if not isinstance(value, list):
            raise TypeError('maxs must be a list')
        if not all(isinstance(val, (int, float)) for val in value):
            raise TypeError('Each value in maxs must be int or float')
        self._maxs = value

    @property
    def columns(self):
        """The list of column indexes, or None if none were supplied"""
        return self._columns

    @columns.setter
    def columns(self, value):
        if not isinstance(value, list) and value is not None:
            raise TypeError('If passed, columns must be list')
        if value is not None:
            if not all(isinstance(val, int) for val in value):
                raise TypeError(
                    'If passed, each value in columns must be an int')
        self._columns = value

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            Dictionary with a 'className' of 'MinMax' and a 'params'
            mapping holding the current mins, maxs and columns
        """
        return {
            'className': 'MinMax',
            'params': {
                'mins': self.mins,
                'maxs': self.maxs,
                'columns': self.columns
            }
        }
class OneHot(BaseObject):
    """
    One Hot encoding preprocessing step

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.tabular.TabularPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.tabular.OneHot(
    ...         6,
    ...         ['one', 'two', 'three']
    ...     )
    ... )
    """

    def __init__(
            self,
            column: int,
            values: list
    ):
        """
        Parameters
        ----------
        column : int
            Integer index of the column to one hot encode
        values : list
            The values, in order, for which binary columns are created.
            To have a default value, simply leave that value out of
            this list

        Raises
        ------
        TypeError
            If column is not an int or values is not a list
        """
        super().__init__()
        # Both assignments are validated by the property setters below;
        # column is checked first.
        self.column = column
        self.values = values

    @property
    def column(self):
        """Index of the column to encode"""
        return self._column

    @column.setter
    def column(self, new_column):
        # Store the index only when it is an int; anything else is an error.
        if isinstance(new_column, int):
            self._column = new_column
        else:
            raise TypeError('column must be integer')

    @property
    def values(self):
        """Ordered list of values to create binary columns for"""
        return self._values

    @values.setter
    def values(self, new_values):
        # Only the container type is checked; the elements are not inspected.
        if isinstance(new_values, list):
            self._values = new_values
        else:
            raise TypeError('values must be list')

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            Dictionary with a 'className' of 'OneHot' and a 'params'
            mapping holding the current column and values
        """
        params = {
            'column': self.column,
            'values': self.values
        }
        return {'className': 'OneHot', 'params': params}
class DropColumn(BaseObject):
    """
    Drop a column from tabular data

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.tabular.TabularPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.tabular.DropColumn(
    ...         3
    ...     )
    ... )
    """

    def __init__(
            self,
            column: int
    ):
        """
        Parameters
        ----------
        column : int
            Index of the column to drop

        Raises
        ------
        ValueError
            If column is not an int
        """
        super().__init__()
        # Validated by the property setter below.
        self.column = column

    @property
    def column(self):
        """Index of the column to drop"""
        return self._column

    @column.setter
    def column(self, new_column):
        # A non-int value is reported as ValueError (not TypeError) here.
        if isinstance(new_column, int):
            self._column = new_column
        else:
            raise ValueError('column must be integer valued')

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            Dictionary with a 'className' of 'DropColumn' and a 'params'
            mapping holding the current column
        """
        params = {'column': self.column}
        return {'className': 'DropColumn', 'params': params}