Source code for aisquared.config.harvesting.TextHarvester

from typing import Union
from aisquared.base import BaseObject, ALLOWED_HOWS


[docs]class TextHarvester(BaseObject): """ Object to harvest text Example usage: >>> import aisquared >>> my_obj = aisquared.config.harvesting.TextHarvester( how = 'all', body_only = True ) >>> my_obj.to_dict() {'className': 'TextHarvester', 'params': {'how': 'all', 'regex': None, 'flags': 'gu', 'bodyOnly': True, 'limit': None}} """ def __init__( self, how: str = ALLOWED_HOWS[0], regex: str = None, flags: str = 'gu', body_only: bool = False, keywords: Union[str, list] = None, limit: int = None ): """ Parameters ---------- how : str (default 'all') How to harvest text (supports ['all', 'regex', 'keywords']) regex : str or None (default None) Javascript-compatible regular expression to use to harvest individual strings flags : str or None (default 'gu') Flags to use with the Regex body_only : bool (default False) Whether to only harvest text from the body of the webpage keywords : list of str or None (default None) If provided, keywords to search for limit : int or None (default None) The limit to the number of times to perform the harvesting, if provided """ super().__init__() self.how = how self.regex = regex self.flags = flags self.body_only = body_only self.keywords = keywords self.limit = limit @property def how(self): return self._how @how.setter def how(self, value): if value not in ALLOWED_HOWS: raise ValueError( f"how must be one of {ALLOWED_HOWS}, got {value}") self._how = value @property def regex(self): return self._regex @regex.setter def regex(self, value): self._regex = str(value) if value is not None else value @property def flags(self): return self._flags @flags.setter def flags(self, value): self._flags = value @property def body_only(self): return self._body_only @body_only.setter def body_only(self, value): if not isinstance(value, bool): raise TypeError('body_only must be Boolean') self._body_only = value @property def limit(self): return self._limit @limit.setter def limit(self, value): if not isinstance(value, int) and value is not None: raise TypeError('limit must be int or None') self._limit = value
[docs] def to_dict(self) -> dict: """ Get the configuration object as a dictionary """ if self.how == 'keywords': return { 'className': 'TextHarvester', 'params': { 'how': 'regex', 'regex': '|'.join(self.keywords), 'flags': self.flags, 'bodyOnly': self.body_only, 'limit': self.limit } } return { 'className': 'TextHarvester', 'params': { 'how': self.how, 'regex': self.regex, 'flags': self.flags, 'bodyOnly': self.body_only, 'limit': self.limit } }