Source code for baloo.core.indexes.base

import numpy as np

from ..generic import BinaryOps, IndexCommon, BalooCommon, BitOps
from ...core.utils import check_type, infer_dtype, is_scalar, check_weld_bit_array, check_valid_int_slice, \
    convert_to_numpy, check_dtype
from ...weld import LazyArrayResult, numpy_to_weld_type, weld_filter, weld_slice, \
    weld_compare, weld_tail, weld_array_op, weld_element_wise_op, WeldObject, weld_iloc_indices, \
    weld_iloc_indices_with_missing, default_missing_data_literal, weld_replace


class Index(LazyArrayResult, BinaryOps, BitOps, IndexCommon, BalooCommon):
    """Weld-ed Pandas Index.

    Attributes
    ----------
    dtype
    name

    See Also
    --------
    pandas.Index : https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.html

    Examples
    --------
    >>> import baloo as bl
    >>> import numpy as np
    >>> ind = bl.Index(np.array(['a', 'b', 'c'], dtype=np.dtype(np.bytes_)))
    >>> ind  # repr
    Index(name=None, dtype=|S1)
    >>> print(ind)  # str
    [b'a' b'b' b'c']
    >>> ind.values
    array([b'a', b'b', b'c'], dtype='|S1')
    >>> len(ind)  # eager
    3

    """

    def __init__(self, data, dtype=None, name=None):
        """Initialize an Index object.

        Parameters
        ----------
        data : np.ndarray or WeldObject or list
            Raw data or Weld expression.
        dtype : np.dtype, optional
            Numpy dtype of the elements. Inferred from `data` by default.
        name : str, optional
            Name of the Index.

        """
        raw, resolved_dtype = _process_input_data(data, dtype)

        self.dtype = resolved_dtype
        self.name = check_type(name, str)
        # The length is only known up-front for raw numpy data; it stays None for lazy input.
        if isinstance(raw, np.ndarray):
            self._length = len(raw)
        else:
            self._length = None

        super().__init__(raw, numpy_to_weld_type(self.dtype))

    def __repr__(self):
        return "{}(name={}, dtype={})".format(type(self).__name__, self.name, self.dtype)

    def _comparison(self, other, comparison):
        # None is compared against the missing-data literal of this Index's weld type.
        if other is None:
            operand = default_missing_data_literal(self.weld_type)
        elif is_scalar(other):
            operand = other
        else:
            raise TypeError('Can currently only compare with scalars')

        return _index_compare(self, operand, comparison)

    def _bitwise_operation(self, other, operation):
        check_type(other, LazyArrayResult)
        check_weld_bit_array(other)
        check_weld_bit_array(self)

        combined = weld_array_op(self.weld_expr, other.weld_expr, self.weld_type, operation)

        return Index(combined, self.dtype, self.name)

    def _element_wise_operation(self, other, operation):
        # Pandas converts result to a Series; unclear why atm
        if isinstance(other, LazyArrayResult):
            expr = weld_array_op(self.weld_expr, other.weld_expr, self.weld_type, operation)
            return Index(expr, self.dtype, self.name)

        if is_scalar(other):
            expr = weld_element_wise_op(self.weld_expr, self.weld_type, other, operation)
            return Index(expr, self.dtype, self.name)

        raise TypeError('Can only apply operation with scalar or LazyArrayResult')

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    def _gather_names(self, name='index'):
        # Fall back on the supplied default only when this Index is unnamed.
        own_name = self.name

        return [name if own_name is None else own_name]

    def _gather_data_for_weld(self):
        return [self.weld_expr]

    def _gather_weld_types(self):
        return [self.weld_type]

    def _gather_data(self, name='index'):
        key = self._gather_names(name)[0]

        return {key: self}

    def _iloc_indices(self, indices):
        selected = weld_iloc_indices(self.weld_expr, self.weld_type, indices)

        return Index(selected, self.dtype, self.name)

    def _iloc_indices_with_missing(self, indices):
        selected = weld_iloc_indices_with_missing(self.weld_expr, self.weld_type, indices)

        return Index(selected, self.dtype, self.name)

    def astype(self, dtype):
        check_dtype(dtype)

        return Index(self._astype(dtype), dtype, self.name)

    def __getitem__(self, item):
        """Select from the Index. Currently used internally through DataFrame and Series.

        Supported selection functionality exemplified below.

        Examples
        --------
        >>> ind = bl.Index(np.arange(3))
        >>> print(ind[ind < 2].evaluate())
        [0 1]
        >>> print(ind[1:2].evaluate())
        [1]

        """
        if isinstance(item, LazyArrayResult):
            # Boolean-mask selection.
            check_weld_bit_array(item)
            filtered = weld_filter(self.weld_expr, self.weld_type, item.weld_expr)
            return Index(filtered, self.dtype, self.name)

        if isinstance(item, slice):
            check_valid_int_slice(item)
            # An empty Index is returned unchanged instead of being sliced.
            if self.empty:
                return self
            sliced = weld_slice(self.weld_expr, self.weld_type, item)
            return Index(sliced, self.dtype, self.name)

        raise TypeError('Expected LazyArrayResult or slice')

    def evaluate(self, verbose=False, decode=True, passes=None, num_threads=1, apply_experimental=True):
        """Evaluates by creating an Index containing evaluated data.

        See `LazyResult`

        Returns
        -------
        Index
            Index with evaluated data.

        """
        evaluated_data = super().evaluate(verbose, decode, passes, num_threads, apply_experimental)

        return Index(evaluated_data, self.dtype, self.name)

    def head(self, n=5):
        """Return Index with first n values.

        Parameters
        ----------
        n : int
            Number of values.

        Returns
        -------
        Index
            Index containing the first n values.

        Examples
        --------
        >>> ind = bl.Index(np.arange(3, dtype=np.float64))
        >>> print(ind.head(2).evaluate())
        [0. 1.]

        """
        return self[:n]

    def tail(self, n=5):
        """Return Index with the last n values.

        Parameters
        ----------
        n : int
            Number of values.

        Returns
        -------
        Index
            Index containing the last n values.

        Examples
        --------
        >>> ind = bl.Index(np.arange(3, dtype=np.float64))
        >>> print(ind.tail(2).evaluate())
        [1. 2.]

        """
        if self.empty:
            return self

        # Not expressed as a slice through __getitem__ because that would require len, which is eager;
        # when the length is not already known, the lazy length expression is passed along instead.
        if self._length is None:
            length = self._lazy_len().weld_expr
        else:
            length = self._length

        return Index(weld_tail(self.weld_expr, length, n), self.dtype, self.name)

    def dropna(self):
        """Returns Index without null values according to Baloo's convention.

        Returns
        -------
        Index
            Index with no null values.

        """
        keep = self.notna()

        return self[keep]

    def fillna(self, value):
        """Returns Index with missing values replaced with value.

        Parameters
        ----------
        value : {int, float, bytes, bool}
            Scalar value to replace missing values with.

        Returns
        -------
        Index
            With missing values replaced.

        Raises
        ------
        TypeError
            If `value` is not a scalar.

        """
        if not is_scalar(value):
            raise TypeError('Value to replace with is not a valid scalar')

        missing_literal = default_missing_data_literal(self.weld_type)
        replaced = weld_replace(self.weld_expr, self.weld_type, missing_literal, value)

        return Index(replaced, self.dtype, self.name)

    @classmethod
    def from_pandas(cls, index):
        """Create baloo Index from pandas Index.

        Parameters
        ----------
        index : pandas.base.Index

        Returns
        -------
        Index

        """
        # Imported lazily so pandas is only needed when converting.
        from pandas import Index as PandasIndex

        check_type(index, PandasIndex)

        return Index(index.values, index.dtype, index.name)

    def to_pandas(self):
        """Convert to pandas Index.

        Returns
        -------
        pandas.base.Index

        Raises
        ------
        ValueError
            If the Index is not raw, i.e. has not been evaluated.

        """
        if not self.is_raw():
            raise ValueError('Cannot convert to pandas Index if not evaluated.')

        # Imported lazily so pandas is only needed when converting.
        from pandas import Index as PandasIndex

        return PandasIndex(self.values, self.dtype, name=self.name)
def _process_input_data(data, dtype):
    """Validate the raw constructor input and resolve its dtype.

    Parameters
    ----------
    data : np.ndarray or WeldObject or list
        Raw data or Weld expression. A list is first converted through `convert_to_numpy`.
    dtype : np.dtype or None
        Requested dtype, passed on to `infer_dtype` together with the data.

    Returns
    -------
    tuple
        The (possibly converted) data and the dtype returned by `infer_dtype`.

    """
    check_type(data, (np.ndarray, WeldObject, list))

    if isinstance(data, list):
        data = convert_to_numpy(data)

    inferred_dtype = infer_dtype(data, dtype)
    # Raw arrays are cast only when their type char differs from the resolved dtype's.
    if isinstance(data, np.ndarray) and data.dtype.char != inferred_dtype.char:
        data = data.astype(inferred_dtype)

    return data, inferred_dtype


def _index_compare(index, other, comparison):
    """Build the boolean Index resulting from comparing `index` with `other`.

    Parameters
    ----------
    index : Index
        Index whose weld expression and weld type feed the comparison.
    other
        Value forwarded to `weld_compare` as the right-hand operand.
    comparison
        Comparison operator forwarded to `weld_compare`.

    Returns
    -------
    Index
        Index of boolean dtype that keeps the name of `index`.

    """
    # np.bool_ rather than the np.bool alias: the alias was deprecated in NumPy 1.20 and removed
    # in 1.24 (AttributeError on 1.24-1.26), whereas np.bool_ is available in every release.
    return Index(weld_compare(index.weld_expr, other, comparison, index.weld_type),
                 np.dtype(np.bool_),
                 index.name)