Source code for aisquared.config.preprocessing.text.Steps

from typing import Union
from aisquared.base import BaseObject, ALLOWED_PADS


class Tokenize(BaseObject):
    """Preprocessing step to tokenize text

    Holds three validated settings (``split_sentences``, ``split_words``
    and ``token_pattern``) and serializes them with :meth:`to_dict`.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.text.TextPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.text.Tokenize()
    ... )

    """

    def __init__(
        self,
        split_sentences: bool = False,
        split_words: bool = True,
        # Raw string: in a normal literal ``\b`` is a backspace character
        # (0x08) rather than a regex word boundary, and ``\w`` is an invalid
        # escape sequence.
        token_pattern: str = r'\b\w\w+\b'
    ):
        """
        Parameters
        ----------
        split_sentences : bool (default False)
            Whether to split on sentences first
        split_words : bool (default True)
            Whether to split on words
        token_pattern : str (default r'\\b\\w\\w+\\b')
            Regex to tokenize on

        Raises
        ------
        TypeError
            If ``split_sentences`` or ``split_words`` is not a bool, or
            ``token_pattern`` is not a str
        """
        super().__init__()
        # Assign through the property setters so every value is type checked.
        self.split_sentences = split_sentences
        self.split_words = split_words
        self.token_pattern = token_pattern

    @property
    def split_sentences(self):
        """bool: whether to split on sentences first"""
        return self._split_sentences

    @split_sentences.setter
    def split_sentences(self, value):
        if not isinstance(value, bool):
            raise TypeError('split_sentences must be bool')
        self._split_sentences = value

    @property
    def split_words(self):
        """bool: whether to split on words"""
        return self._split_words

    @split_words.setter
    def split_words(self, value):
        if not isinstance(value, bool):
            raise TypeError('split_words must be bool')
        self._split_words = value

    @property
    def token_pattern(self):
        """str: the regex to tokenize on"""
        return self._token_pattern

    @token_pattern.setter
    def token_pattern(self, value):
        # Only the type is checked; the pattern itself is not compiled here.
        if not isinstance(value, str):
            raise TypeError('token_pattern must be string')
        self._token_pattern = value

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            ``{'className': 'Tokenize', 'params': {...}}`` where ``params``
            holds ``splitSentences``, ``splitWords`` and ``tokenPattern``
        """
        return {
            'className': 'Tokenize',
            'params': {
                'splitSentences': self.split_sentences,
                'splitWords': self.split_words,
                'tokenPattern': self.token_pattern
            }
        }
class RemoveCharacters(BaseObject):
    """Preprocessing step to remove characters from text

    Stores two validated boolean switches and serializes them with
    :meth:`to_dict`.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.text.TextPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.text.RemoveCharacters()
    ... )

    """

    def __init__(
        self,
        remove_digits: bool = True,
        remove_punctuation: bool = True
    ):
        """
        Parameters
        ----------
        remove_digits : bool (default True)
            Whether to remove digits from input text
        remove_punctuation : bool (default True)
            Whether to remove punctuation from input text

        Raises
        ------
        TypeError
            If either argument is not a bool
        """
        super().__init__()
        # Both assignments go through the validating setters below.
        self.remove_digits = remove_digits
        self.remove_punctuation = remove_punctuation

    @property
    def remove_digits(self):
        """bool: whether digits are removed from input text"""
        return self._remove_digits

    @remove_digits.setter
    def remove_digits(self, flag):
        if isinstance(flag, bool):
            self._remove_digits = flag
        else:
            raise TypeError('remove_digits must be bool')

    @property
    def remove_punctuation(self):
        """bool: whether punctuation is removed from input text"""
        return self._remove_punctuation

    @remove_punctuation.setter
    def remove_punctuation(self, flag):
        if isinstance(flag, bool):
            self._remove_punctuation = flag
        else:
            raise TypeError('remove_punctuation must be bool')

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            ``{'className': 'RemoveCharacters', 'params': {...}}`` where
            ``params`` holds ``removeDigits`` and ``removePunctuation``
        """
        params = {
            'removeDigits': self.remove_digits,
            'removePunctuation': self.remove_punctuation
        }
        return {'className': 'RemoveCharacters', 'params': params}
class ConvertToCase(BaseObject):
    """Text preprocessing object to convert inputs to all lowercase or all uppercase

    Stores a single validated boolean, ``lowercase``, and serializes it with
    :meth:`to_dict`.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.text.TextPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.text.ConvertToCase()
    ... )

    """

    def __init__(
        self,
        lowercase: bool = True
    ):
        """
        Parameters
        ----------
        lowercase : bool (default True)
            Whether to convert to lower case. If False, converts to all uppercase

        Raises
        ------
        TypeError
            If ``lowercase`` is not a bool
        """
        super().__init__()
        # Routed through the setter so the type check always runs.
        self.lowercase = lowercase

    @property
    def lowercase(self):
        """bool: True for lower case, False for upper case"""
        return self._lowercase

    @lowercase.setter
    def lowercase(self, flag):
        if isinstance(flag, bool):
            self._lowercase = flag
        else:
            raise TypeError('lowercase must be bool')

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            ``{'className': 'ConvertToCase', 'params': {'lowercase': ...}}``
        """
        params = {'lowercase': self.lowercase}
        return {'className': 'ConvertToCase', 'params': params}
class ConvertToVocabulary(BaseObject):
    """Text preprocessing object to convert tokens to integer vocabularies

    Holds a validated string -> integer vocabulary together with the start
    character, out-of-vocabulary character and optional vocabulary cap, and
    serializes them with :meth:`to_dict`.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.text.TextPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.text.ConvertToVocabulary(
    ...         {
    ...             'test' : 3,
    ...             'vocabulary' : 4
    ...         }
    ...     )
    ... )

    """

    def __init__(
        self,
        vocabulary: dict,
        start_character: int = 1,
        oov_character: int = 2,
        max_vocab: Union[int, None] = None
    ):
        """
        Parameters
        ----------
        vocabulary : dict
            Dictionary of string -> integer mappings
        start_character : int (default 1)
            The character to use for the start of an input sequence
        oov_character : int (default 2)
            The character to use for out of vocabulary tokens
        max_vocab : int or None (default None)
            The maximum vocabulary integer to use. If None, all vocabulary are used

        Raises
        ------
        TypeError
            If ``vocabulary`` is not a dict, ``start_character`` or
            ``oov_character`` is not an int, or ``max_vocab`` is neither an
            int nor None
        ValueError
            If any vocabulary key is not a str or any vocabulary value is
            not an int
        """
        super().__init__()
        # Assign through the property setters so every value is validated.
        self.vocabulary = vocabulary
        self.start_character = start_character
        self.oov_character = oov_character
        self.max_vocab = max_vocab

    @property
    def vocabulary(self):
        """dict: the string -> integer vocabulary mapping"""
        return self._vocabulary

    @vocabulary.setter
    def vocabulary(self, value):
        if not isinstance(value, dict):
            raise TypeError('vocabulary must be dictionary')
        # Generator expressions let all() stop at the first offending entry
        # instead of building a full list first.
        if not all(isinstance(k, str) for k in value.keys()):
            raise ValueError('All keys in vocabulary must be strings')
        if not all(isinstance(v, int) for v in value.values()):
            raise ValueError('All values in vocabulary must be integers')
        # The dict is stored by reference, not copied.
        self._vocabulary = value

    @property
    def start_character(self):
        """int: the character used for the start of an input sequence"""
        return self._start_character

    @start_character.setter
    def start_character(self, value):
        if not isinstance(value, int):
            raise TypeError('start_character must be int')
        self._start_character = value

    @property
    def oov_character(self):
        """int: the character used for out of vocabulary tokens"""
        return self._oov_character

    @oov_character.setter
    def oov_character(self, value):
        if not isinstance(value, int):
            raise TypeError('oov_character must be int')
        self._oov_character = value

    @property
    def max_vocab(self):
        """int or None: the maximum vocabulary integer to use"""
        return self._max_vocab

    @max_vocab.setter
    def max_vocab(self, value):
        # None is accepted as-is; any other value must be an int.
        if value is not None and not isinstance(value, int):
            raise TypeError('max_vocab must be int')
        self._max_vocab = value

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            ``{'className': 'ConvertToVocabulary', 'params': {...}}`` where
            ``params`` holds ``vocabulary``, ``startCharacter``,
            ``oovCharacter`` and ``maxVocab``
        """
        return {
            'className': 'ConvertToVocabulary',
            'params': {
                'vocabulary': self.vocabulary,
                'startCharacter': self.start_character,
                'oovCharacter': self.oov_character,
                'maxVocab': self.max_vocab
            }
        }
class PadSequences(BaseObject):
    """Text preprocessing object to pad sequences

    Stores the pad character, target length and the pad / truncate
    locations, each validated on assignment, and serializes them with
    :meth:`to_dict`.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.text.TextPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.text.PadSequences()
    ... )

    """

    def __init__(
        self,
        pad_character: int = 0,
        length: int = 128,
        pad_location: str = 'post',
        truncate_location: str = 'post'
    ):
        """
        Parameters
        ----------
        pad_character : int (default 0)
            The character to use for padding
        length : int (default 128)
            The length to pad sequences to
        pad_location : str (default 'post')
            One of either 'pre' or 'post', corresponding to how sequences are
            to be padded
        truncate_location : str (default 'post')
            One of either 'pre' or 'post', corresponding to how sequences are
            to be truncated

        Raises
        ------
        TypeError
            If ``pad_character`` or ``length`` is not an int, or
            ``pad_location`` or ``truncate_location`` is not a str
        ValueError
            If ``length`` is not greater than 0, or a location is not in
            ``ALLOWED_PADS``
        """
        super().__init__()
        # Every assignment below is checked by the matching property setter.
        self.pad_character = pad_character
        self.length = length
        self.pad_location = pad_location
        self.truncate_location = truncate_location

    @property
    def pad_character(self):
        """int: the character to use for padding"""
        return self._pad_character

    @pad_character.setter
    def pad_character(self, char):
        if isinstance(char, int):
            self._pad_character = char
        else:
            raise TypeError('pad_character must be int')

    @property
    def length(self):
        """int: the length to pad sequences to"""
        return self._length

    @length.setter
    def length(self, size):
        # The type is checked before the range so a non-int never reaches
        # the comparison.
        if not isinstance(size, int):
            raise TypeError('length must be int')
        if size < 1:
            raise ValueError('length must be greater than 0')
        self._length = size

    @property
    def pad_location(self):
        """str: where padding is applied; a member of ALLOWED_PADS"""
        return self._pad_location

    @pad_location.setter
    def pad_location(self, location):
        if not isinstance(location, str):
            raise TypeError('pad_location must be str')
        if location in ALLOWED_PADS:
            self._pad_location = location
        else:
            raise ValueError(f'pad_location must be one of {ALLOWED_PADS}')

    @property
    def truncate_location(self):
        """str: where truncation is applied; a member of ALLOWED_PADS"""
        return self._truncate_location

    @truncate_location.setter
    def truncate_location(self, location):
        if not isinstance(location, str):
            raise TypeError('truncate_location must be str')
        if location in ALLOWED_PADS:
            self._truncate_location = location
        else:
            raise ValueError(
                f'truncate_location must be one of {ALLOWED_PADS}')

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            ``{'className': 'PadSequences', 'params': {...}}`` where
            ``params`` holds ``padCharacter``, ``length``, ``padLocation``
            and ``truncateLocation``
        """
        params = {
            'padCharacter': self.pad_character,
            'length': self.length,
            'padLocation': self.pad_location,
            'truncateLocation': self.truncate_location
        }
        return {'className': 'PadSequences', 'params': params}
class Trim(BaseObject):
    """Text preprocessing class to trim whitespace from text

    Takes no parameters; :meth:`to_dict` always emits an empty ``params``
    mapping.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.text.TextPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.text.Trim()
    ... )

    """

    def __init__(self):
        # No state of its own; only the base class initializer runs.
        super().__init__()

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            ``{'className': 'Trim', 'params': {}}``
        """
        config = {'className': 'Trim'}
        config['params'] = {}
        return config