Source code for aisquared.config.preprocessing.text.TextPreprocessing

from aisquared.base import BaseObject
from .Steps import Tokenize, RemoveCharacters, ConvertToCase, ConvertToVocabulary, PadSequences, Trim

# Step classes that TextPreprocesser.add_step accepts. Kept as a tuple so it
# can be handed directly to isinstance() as the set of permitted types.
ALLOWED_STEPS = (
    Tokenize, RemoveCharacters, ConvertToCase,
    ConvertToVocabulary, PadSequences, Trim,
)


class TextPreprocesser(BaseObject):
    """
    Preprocesser object for natural language

    Collects text preprocessing steps in the order they are added and
    serializes them into a configuration dictionary.

    Example usage:

    >>> import aisquared
    >>> preprocesser = aisquared.config.preprocessing.text.TextPreprocesser()
    >>> preprocesser.add_step(
    ...     aisquared.config.preprocessing.text.Tokenize()
    ... )
    """

    def __init__(
            self,
            steps: list = None
    ) -> None:
        """
        Parameters
        ----------
        steps : list or None (default None)
            List of preprocessing steps for natural language. Each element
            is validated through `add_step`, so it must be an instance of
            one of the classes in `ALLOWED_STEPS`.

        Raises
        ------
        TypeError
            If any element of `steps` is not an allowed step type
        """
        super().__init__()
        # `steps` stays None until the first step is added; an empty input
        # list therefore also leaves it as None.
        self.steps = None
        if steps is not None:
            for step in steps:
                self.add_step(step)

    @property
    def step_dict(self):
        """
        The steps in serialized form

        Returns
        -------
        list or None
            The result of calling `to_dict()` on each step, in order, or
            None if no steps have been added
        """
        if self.steps is None:
            return None
        return [step.to_dict() for step in self.steps]

    def add_step(self, step) -> None:
        """
        Add a step to the preprocesser object

        Parameters
        ----------
        step : instance of one of the classes in `ALLOWED_STEPS`
            The preprocessing step to add after any existing steps

        Raises
        ------
        TypeError
            If `step` is not an instance of an allowed step class
        """
        if not isinstance(step, ALLOWED_STEPS):
            raise TypeError(f'Each step must be one of {ALLOWED_STEPS}')
        if self.steps is None:
            self.steps = [step]
        else:
            # Rebind to a new list instead of appending in place, so a list
            # object previously read from `self.steps` is not mutated.
            self.steps = self.steps + [step]

    def to_dict(self) -> dict:
        """
        Get the configuration object as a dictionary

        Returns
        -------
        dict
            Dictionary with the keys 'className' and 'steps', where 'steps'
            is the value of `step_dict`
        """
        # NOTE(review): 'TextPreprocessor' is spelled differently from the
        # class name (TextPreprocesser). Presumably consumers of this config
        # match on this exact string - confirm before changing it.
        return {
            'className': 'TextPreprocessor',
            'steps': self.step_dict
        }