from warnings import warn
import numpy as np
from scipy import sparse
def asarray(a):
    """Convert ``a`` to a plain ``np.ndarray`` with length-1 axes removed.

    Convenience helper for turning an ``np.matrix`` into a regular array,
    including the dimension reduction.

    Parameters
    ----------
    a : array_like
        Value to convert.

    Returns
    -------
    np.ndarray
        Array built from ``a`` with every singleton dimension squeezed out.
    """
    as_ndarray = np.array(a)
    return np.squeeze(as_ndarray)
def density(x):
    """Return the fraction of cells of ``x`` that are explicitly stored.

    Parameters
    ----------
    x : sparse matrix
        2-D matrix exposing ``getnnz()`` and ``shape``.

    Returns
    -------
    float
        ``x.getnnz()`` divided by the number of cells (rows * columns).
    """
    n_rows, n_cols = x.shape[0], x.shape[1]
    n_cells = n_rows * n_cols
    return x.getnnz() / n_cells
def _sum_false(msk, axis):
    """Count the entries of a boolean mask that are not set.

    Parameters
    ----------
    msk : sparse matrix [bool]
        2-D mask whose set entries are to be excluded from the count.
    axis : int or None
        Axis to count along, or None to count over the whole matrix.

    Returns
    -------
    np.ndarray
        Number of cells along ``axis`` minus the mask's sum along it.
    """
    if axis is not None:
        n_cells = msk.shape[axis]
    else:
        n_cells = msk.shape[0] * msk.shape[1]
    n_set = msk.sum(axis=axis)
    return asarray(n_cells - n_set)
def make_nan_mask(x):
    """Build a boolean sparse mask marking the stored NaN entries of ``x``.

    Parameters
    ----------
    x : sparse matrix
        Data matrix in any scipy sparse format (CSR, CSC, COO, ...).

    Returns
    -------
    msk : sparse.csc_matrix [bool]
        Matrix with the same shape as ``x`` that stores ``True`` exactly
        where ``x`` stores a NaN; all other positions are left implicit.
    """
    # The (data, indices, indptr) triple used below is only meaningful in
    # CSC layout. Reusing the index arrays of e.g. a CSR matrix would yield a
    # transposed/garbled mask (or a constructor error for non-square input),
    # so bring every input into CSC first.
    x_csc = sparse.csc_matrix(x)
    nans = np.isnan(x_csc.data)
    # copy() gives the mask its own index arrays: the constructor may reuse
    # the ones passed in, and eliminate_zeros() rewrites them in place.
    msk = sparse.csc_matrix(
        (nans, x_csc.indices, x_csc.indptr), shape=x_csc.shape
    ).copy()
    # Keep only the True (NaN) entries as stored values.
    msk.eliminate_zeros()
    return msk
def mean(x, axis=None, mask=None, safe=False, **kwargs):
    """
    Mean along an axis, or of the whole matrix, optionally ignoring masked cells.

    Parameters
    ----------
    x : sparse matrix
        Data matrix.
    axis : int or None
        Axis to average along, or None for the whole matrix.
    mask : sparse matrix [bool] or None
        Cells to leave out of the mean. Must have the same shape as ``x``
        and a boolean dtype. When None, ``np.mean`` is used directly.
    safe : bool
        Only used with a mask. When True, a copy of ``x`` has its masked
        cells set to zero before summing. When False, ``x`` is summed as-is
        (presumably the caller has already zeroed the masked cells).
    kwargs
        Forwarded to ``np.mean``; only used when ``mask`` is None.

    Returns
    -------
    np.ndarray
        Squeezed mean: the sum of ``x`` divided by the number of unmasked
        cells when a mask is given, otherwise the plain ``np.mean`` result.
    """
    if mask is not None:
        assert x.shape == mask.shape, 'x and mask must have the same shape'
        assert mask.dtype == 'bool', 'mask must be boolean'
        if safe:
            warn("Masking is safe making sure original matrix is zeroed. May be slow")
            # Work on a copy so the caller's matrix is left untouched.
            summed_from = x.copy()
            summed_from[mask] = 0
            summed_from.eliminate_zeros()
        else:
            summed_from = x
        total = summed_from.sum(axis=axis)
        # The denominator counts only the cells that are not masked.
        n_valid = _sum_false(mask, axis=axis)
        return asarray(total / n_valid)

    plain = np.mean(x, axis=axis, **kwargs)
    if np.isnan(plain).any():
        warn('Result contains nans. Consider adding a nan mask')
    return asarray(plain)
def variance(x, axis=None, mask=None, **kwargs):
    """
    Variance along an axis, or of the whole sparse matrix.

    Computed as ``E[x**2] - E[x]**2`` using this module's ``mean``.

    Parameters
    ----------
    x : sparse.csr_matrix
        matrix to compute variance for
    axis : int or None
        axis to return variance for, or None if for entire matrix
    mask : sparse matrix [bool] or None
        mask forwarded to ``mean`` for both terms
    kwargs
        forwarded to ``mean`` for both terms

    Returns
    -------
    var_ : array_like
        array of ndim=1 if axis is given or 0 dim (scalar) if axis is None
    """
    mean_of_squares = mean(x.power(2), axis=axis, mask=mask, **kwargs)
    plain_mean = mean(x, axis=axis, mask=mask, **kwargs)
    square_of_mean = np.power(plain_mean, 2)
    return asarray(mean_of_squares - square_of_mean)
# cov and corr
def _cov_block(x, y=None, mask=None):
    """Covariance of the columns of ``x`` with ``y`` (or with ``x`` itself).

    Uses ``cov(x, y) = E[xy] - E[x]E[y]``.

    Parameters
    ----------
    x : sparse matrix
        Data matrix.
    y : array_like or None
        Second operand with the same number of rows as ``x``; when None the
        covariance of ``x`` with itself is computed.
    mask : sparse matrix [bool] or None
        Mask forwarded to ``mean`` for the column means.

    Returns
    -------
    Mean of the products minus the product of the means.
    """
    # NOTE(review): the same mask is forwarded to mean(y, ...), whose shape
    # assertion requires y to have the mask's shape -- confirm this is the
    # intended contract when y is a target column.
    # NOTE(review): E[xy] is divided by the full row count even when a mask
    # is given -- confirm this is intended.
    x_mean_col = mean(x, axis=0, mask=mask).reshape(-1, 1)
    n_rows = x.shape[0]
    if y is not None:
        assert n_rows == y.shape[0], 'x and y must have same number of rows'
        y_mean_row = mean(y, axis=0, mask=mask).reshape(-1, 1).T
    else:
        y = x
        y_mean_row = x_mean_col.T
    product_of_means = x_mean_col.dot(y_mean_row).squeeze()
    mean_of_products = x.T.dot(y) / n_rows
    return mean_of_products - product_of_means
def cov(x, y=None, mask=None, blocks=1):
    """
    Covariance of ``x`` with ``y``, or of ``x`` with itself.

    Parameters
    ----------
    x : sparse matrix
        Data Matrix
    y : array_like
        target array
    mask : sparse matrix [bool]
        mask of values to consider as nan
    blocks : int
        number of blocks to compute in (for large matrices); only 1 is
        currently supported

    Returns
    -------
    ret
        covariance vector if y is given, otherwise the auto-covariance of x

    Raises
    ------
    NotImplementedError
        if ``blocks`` is not 1
    """
    if blocks != 1:
        # TODO: block-wise computation is not implemented. The unfinished
        # sketch split the rows into `blocks` chunks of `rows // blocks`
        # and filled a (cols, cols) result from per-chunk _cov_block calls.
        raise NotImplementedError
    return _cov_block(x, y, mask)
def _autocorr(x, mask=None):
    """
    Correlation matrix between the features of matrix x.

    Parameters
    ----------
    x : sparse matrix
        data matrix
    mask : sparse matrix [bool]
        mask of values to consider as nan

    Returns
    -------
    np.ndarray
        correlation matrix ndim=2
    """
    covariance = cov(x, mask=mask)
    # The square roots of the covariance diagonal are the standard deviations.
    std = np.sqrt(np.diag(covariance))
    # Normalise rows first, then columns.
    row_scaled = covariance / std[:, None]
    return row_scaled / std[None, :]
def _corr_target(x, y, mask=None):
    """
    Correlation vector between the features of matrix x and target column y.

    Parameters
    ----------
    x : sparse matrix
        Data Matrix
    y : array_like
        target array; must expose ``.var()``
    mask : sparse matrix [bool]
        mask of values to consider as nan

    Returns
    -------
    np.ndarray
        vector of correlation to target column ndim=1
    """
    covariance = cov(x, y, mask=mask)
    x_var = variance(x, axis=0, mask=mask)
    y_var = y.var()
    # Divide by the product of the standard deviations.
    scale = np.sqrt(x_var * y_var)
    return covariance / scale
def corr(x, y=None, mask=None):
    """
    Correlation of x with a target column, or between the features of x.

    Parameters
    ----------
    x : sparse matrix
        Data Matrix
    y : array_like
        target array
    mask : sparse matrix [bool]
        mask of values to consider as nan

    Returns
    -------
    ret
        correlation vector if y is given, otherwise the auto-correlation
        matrix of x
    """
    if y is not None:
        result = _corr_target(x, y, mask=mask)
    else:
        result = _autocorr(x, mask=mask)
    return np.array(result)
# Indexing
def non_zero_index(x, axis, as_bool=True):
    """
    Index of the rows/features that are not all zero.

    An entry is selected when the sum of absolute values taken along
    ``axis`` is non-zero.

    Parameters
    ----------
    x : sparse matrix
        data matrix
    axis : int
        axis the absolute values are summed along
    as_bool : bool
        whether to return a mask of bool indices or vector of numbers

    Returns
    -------
    cond : array_like
        Either a boolean mask or an array of the selected positions
    """
    magnitude = asarray(abs(x).sum(axis=axis))
    if not as_bool:
        return magnitude.nonzero()[0]
    return magnitude.astype(bool)
def non_constant_index(x, axis, mask=None, as_bool=True, threshold=0, method='variance'):
    """
    Indices of the non constant (informative) rows/features.

    Parameters
    ----------
    x : sparse matrix
        data matrix
    axis : int
        axis to return indices for
    mask : sparse matrix [bool]
        mask of values to consider as nan
    as_bool : bool
        whether to return a mask of bool indices or vector of numbers
    threshold : numeric
        value the score must exceed; can be larger than 0 to also drop
        "almost-constant" features
    method : str
        ``'variance'`` scores by variance along ``axis``, ``'nnz'`` by the
        number of stored entries along ``axis``

    Returns
    -------
    cond : array_like
        Either a boolean mask or an array of the selected positions

    Raises
    ------
    KeyError
        if ``method`` is not recognised and the shortcut below is not taken
    """
    # Shortcut: with no mask and a zero threshold, defer to non_zero_index.
    # NOTE(review): this path ignores `method` and selects every line that
    # is not all zero, which looks different from `variance > 0` for a
    # constant non-zero line -- confirm this is intended.
    if (threshold == 0) and (mask is None):
        return non_zero_index(x, axis, as_bool)

    if method == 'variance':
        score = variance(x, axis=axis, mask=mask)
    elif method == 'nnz':
        score = x.getnnz(axis=axis)
    else:
        raise KeyError

    cond = score > threshold
    return cond if as_bool else np.where(cond)[0]
def constant_index(x, axis, mask=None, as_bool=True, threshold=0):
    """
    Indices of the constant rows/features.

    An entry is selected when its variance along ``axis`` is at most
    ``threshold``.

    Parameters
    ----------
    x : sparse matrix
        data matrix
    axis : int
        axis to return indices for
    mask : sparse matrix [bool]
        mask of values to consider as nan
    as_bool : bool
        whether to return a mask of bool indices or vector of numbers
    threshold : numeric
        largest variance still treated as constant; can be larger than 0
        for "almost-constant" features

    Returns
    -------
    cond : array_like
        Either a boolean mask or an array of the selected positions
    """
    is_constant = variance(x, axis=axis, mask=mask) <= threshold
    if not as_bool:
        return np.where(is_constant)[0]
    return is_constant