Source code for baloo.core.strings

from .series import Series
from .utils import check_type
from ..weld import weld_str_lower, weld_str_upper, weld_str_capitalize, weld_str_get, weld_str_strip, weld_str_slice, \
    weld_str_contains, weld_to_numpy_dtype, WeldBit, weld_str_startswith, weld_str_endswith, weld_str_find, WeldLong, \
    weld_str_replace, weld_str_split


class StringMethods(object):
    """Element-wise string operations over the values of a Series.

    Each method forwards the values of the wrapped Series to the matching
    ``weld_str_*`` function and wraps the output in a new Series.

    Parameters
    ----------
    data : Series
        Series whose values the string operations are applied to.

    """
    def __init__(self, data):
        self._data = check_type(data, Series)

    def lower(self):
        """Convert all characters to lowercase.

        Returns
        -------
        Series

        """
        return _series_str_result(self, weld_str_lower)

    def upper(self):
        """Convert all characters to uppercase.

        Returns
        -------
        Series

        """
        return _series_str_result(self, weld_str_upper)

    def capitalize(self):
        """Convert first character to uppercase and remainder to lowercase.

        Returns
        -------
        Series

        """
        return _series_str_result(self, weld_str_capitalize)

    def get(self, i):
        """Extract i'th character of each element.

        Parameters
        ----------
        i : int

        Returns
        -------
        Series

        """
        check_type(i, int)

        return _series_str_result(self, weld_str_get, i=i)

    def strip(self):
        """Strip whitespace from start and end of each element.

        Note it currently only looks for whitespace (ASCII 32), not tabs or EOL.

        Returns
        -------
        Series

        """
        return _series_str_result(self, weld_str_strip)

    def slice(self, start=None, stop=None, step=None):
        """Slice substrings from each element.

        Note that only strictly positive steps are currently supported.

        Parameters
        ----------
        start : int, optional
        stop : int, optional
        step : int, optional

        Returns
        -------
        Series

        Raises
        ------
        ValueError
            If `step` is zero or negative.

        """
        check_type(start, int)
        check_type(stop, int)
        check_type(step, int)

        # A zero step can never advance through the string, so it is rejected
        # together with negative steps instead of being handed to weld.
        if step is not None and step <= 0:
            raise ValueError('Only positive steps are currently supported')

        return _series_str_result(self, weld_str_slice, start=start, stop=stop, step=step)

    def contains(self, pat):
        """Test if pat is included within elements.

        Parameters
        ----------
        pat : str

        Returns
        -------
        Series

        """
        check_type(pat, str)

        return _series_bool_result(self, weld_str_contains, pat=pat)

    def startswith(self, pat):
        """Test if elements start with pat.

        Parameters
        ----------
        pat : str

        Returns
        -------
        Series

        """
        check_type(pat, str)

        return _series_bool_result(self, weld_str_startswith, pat=pat)

    def endswith(self, pat):
        """Test if elements end with pat.

        Parameters
        ----------
        pat : str

        Returns
        -------
        Series

        """
        check_type(pat, str)

        return _series_bool_result(self, weld_str_endswith, pat=pat)

    def find(self, sub, start=0, end=None):
        """Search each element for a substring.

        Unlike `contains`, the result is not boolean: the returned Series
        holds the long-typed output of ``weld_str_find``.
        NOTE(review): presumably the index of the match, with a sentinel such
        as -1 when absent (as in pandas) -- confirm against weld_str_find.

        Parameters
        ----------
        sub : str
            Substring to search for.
        start : int, optional
            Index to start searching from.
        end : int, optional
            Index at which to stop searching.

        Returns
        -------
        Series

        Raises
        ------
        ValueError
            If `end` is given and is not greater than `start`.

        """
        check_type(sub, str)
        check_type(start, int)
        check_type(end, int)

        if end is not None and start >= end:
            raise ValueError('End must be greater than start')

        return Series(weld_str_find(self._data.values, sub, start, end),
                      self._data.index,
                      weld_to_numpy_dtype(WeldLong()),
                      self._data.name)

    # TODO: replace multiple occurrences, not just first
    def replace(self, pat, rep):
        """Replace first occurrence of pat with rep in each element.

        Parameters
        ----------
        pat : str
        rep : str

        Returns
        -------
        Series

        """
        check_type(pat, str)
        check_type(rep, str)

        return _series_str_result(self, weld_str_replace, pat=pat, rep=rep)

    # TODO: rsplit
    def split(self, pat, side='left'):
        """Split once each element from the left and select a side to return.

        Note this is unlike pandas split in that it essentially combines the split with a select.

        Parameters
        ----------
        pat : str
        side : {'left', 'right'}
            Which side of the split to select and return in each element.

        Returns
        -------
        Series

        Raises
        ------
        ValueError
            If `side` is neither 'left' nor 'right'.

        """
        check_type(pat, str)
        check_type(side, str)

        # don't want this made with the object
        _split_mapping = {
            'left': 0,
            'right': 1
        }

        if side not in _split_mapping:
            raise ValueError('Can only select left or right side of split')

        return _series_str_result(self, weld_str_split, pat=pat, side=_split_mapping[side])
def _series_str_result(series, func, **kwargs):
    """Run a weld string function and wrap its output, keeping the source dtype.

    Parameters
    ----------
    series : StringMethods
        Accessor whose ``_data`` attribute holds the wrapped Series
        (callers pass ``self``).
    func : callable
        Called as ``func(values, **kwargs)`` on the wrapped Series' values.
    kwargs
        Extra keyword arguments forwarded to `func`.

    Returns
    -------
    Series
        Built from the output of `func` with the index, dtype and name of
        the wrapped Series.

    """
    source = series._data
    values = func(source.values, **kwargs)

    return Series(values, source.index, source.dtype, source.name)


def _series_bool_result(series, func, **kwargs):
    """Run a weld string predicate and wrap its output with the WeldBit dtype.

    Parameters
    ----------
    series : StringMethods
        Accessor whose ``_data`` attribute holds the wrapped Series
        (callers pass ``self``).
    func : callable
        Called as ``func(values, **kwargs)`` on the wrapped Series' values.
    kwargs
        Extra keyword arguments forwarded to `func`.

    Returns
    -------
    Series
        Built from the output of `func` with the index and name of the
        wrapped Series and the dtype ``weld_to_numpy_dtype(WeldBit())``.

    """
    source = series._data
    values = func(source.values, **kwargs)
    index = source.index
    bit_dtype = weld_to_numpy_dtype(WeldBit())

    return Series(values, index, bit_dtype, source.name)