Source code for singlet.dataset.dimensionality

# vim: fdm=indent
# author:     Fabio Zanini
# date:       16/08/17
# content:    Dataset functions to reduce dimensionality of gene expression
#             and phenotypes.
# Modules
import numpy as np
import pandas as pd

from ..utils.cache import method_caches


# Classes / functions
[docs]class DimensionalityReduction(): '''Reduce dimensionality of gene expression and phenotype in single cells''' def __init__(self, dataset): '''Reduce dimensionality of gene expression and phenotype in single cells Args: dataset (Dataset): the dataset to analyze. ''' self.dataset = dataset
[docs] @method_caches def pca(self, n_dims=2, transform='log10', robust=True, random_state=None): '''Principal component analysis Args: n_dims (int): Number of dimensions (2+). transform (string or None): Whether to preprocess the data. robust (bool): Whether to use Principal Component Pursuit to exclude outliers. Returns: dict of the left eigenvectors (vs), right eigenvectors (us) of the singular value decomposition, eigenvalues (lambdas), the transform, and the whiten function (for plotting). ''' from sklearn.decomposition import PCA X = self.dataset.counts.copy() pco = self.dataset.counts.pseudocount if transform == 'log10': X = np.log10(X + pco) elif transform == 'log2': X = np.log2(X + pco) elif transform == 'log': X = np.log(X + pco) whiten = lambda x: ((x.T - X.mean(axis=1)) / X.std(axis=1, ddof=0)).T Xnorm = whiten(X) # NaN (e.g. features that do not vary i.e. dropout) Xnorm[np.isnan(Xnorm)] = 0 if robust: #from numpy.linalg import matrix_rank #rank = matrix_rank(Xnorm.values) # Principal Component Pursuit (PSP) rpca = _RPCA(Xnorm.values) # L is low-rank, S is sparse (outliers) L, S = rpca.fit(max_iter=1000, iter_print=None) L = pd.DataFrame(L, index=X.index, columns=X.columns) whiten = lambda x: ((x.T - L.mean(axis=1)) / L.std(axis=1)).T Xnorm = whiten(L) Xnorm[np.isnan(Xnorm)] = 0 #print('rPCA: original rank:', rank, # 'reduced rank:', matrix_rank(L), # 'sparse rank:', matrix_rank(S)) pca = PCA(n_components=n_dims, random_state=random_state) vs = pd.DataFrame( pca.fit_transform(Xnorm.values.T), columns=['PC'+str(i+1) for i in range(pca.n_components)], index=X.columns) us = pd.DataFrame( pca.components_, index=vs.columns, columns=X.index).T return { 'vs': vs, 'us': us, 'eigenvalues': pca.explained_variance_ * Xnorm.shape[1], 'transform': pca.transform, 'whiten': whiten, }
[docs] @method_caches def tsne( self, n_dims=2, perplexity=30, theta=0.5, rand_seed=0, **kwargs): '''t-SNE algorithm. Args: n_dims (int): Number of dimensions to use. perplexity (float): Perplexity of the algorithm. theta (float): A number between 0 and 1. Higher is faster but less accurate (via the Barnes-Hut approximation). rand_seed (int): Random seed. -1 randomizes each run. **kwargs: Named arguments passed to the t-SNE algorithm. Returns: ''' # scikit-learn's <0.19 has a bug import sklearn vmaj, vmin, vrel = sklearn.__version__.split('.') if (int(vmaj) == 0) and (int(vmin) < 19): from bhtsne import tsne use_bhtsne = True else: from sklearn.manifold import TSNE use_bhtsne = False n = self.dataset.n_samples if(n - 1 < 3 * perplexity): raise ValueError('Perplexity too high, reduce to <= {:}'.format((n - 1.)/3)) X = self.dataset.counts.copy() if use_bhtsne: # this version does not require pre-whitening Y = tsne( data=X.values.T, dimensions=n_dims, perplexity=perplexity, theta=theta, rand_seed=rand_seed, **kwargs) else: Y = TSNE( n_components=n_dims, perplexity=perplexity, method='barnes_hut' if theta > 0 else 'exact', angle=theta, random_state=rand_seed, ).fit_transform( X.values.T ) vs = pd.DataFrame( Y, index=X.columns, columns=['dimension '+str(i+1) for i in range(n_dims)]) return vs
[docs] @method_caches def umap( self, n_dims=2, rand_seed=0, **kwargs): '''Uniform Manifold Approximation and Projection. Args: n_dims (int): Number of dimensions to use. rand_seed (int): Random seed. -1 randomizes each run. **kwargs: Named arguments passed to umap.UMAP. Returns: ''' from umap import UMAP X = self.dataset.counts Y = UMAP( n_components=n_dims, random_state=rand_seed, **kwargs).fit_transform( X.values.T ) vs = pd.DataFrame( Y, index=X.columns, columns=['dimension '+str(i+1) for i in range(n_dims)]) return vs
# Supplementary classes class _RPCA: '''from: https://github.com/dganguli/robust-pca''' def __init__(self, D, mu=None, lmbda=None): self.D = D self.S = np.zeros(self.D.shape) self.Y = np.zeros(self.D.shape) if mu: self.mu = mu else: self.mu = np.prod(self.D.shape) / (4 * self.norm_p(self.D, 2)) self.mu_inv = 1 / self.mu if lmbda: self.lmbda = lmbda else: self.lmbda = 1 / np.sqrt(np.max(self.D.shape)) @staticmethod def norm_p(M, p): return np.sum(np.power(M, p)) @staticmethod def shrink(M, tau): return np.sign(M) * np.maximum((np.abs(M) - tau), np.zeros(M.shape)) def svd_threshold(self, M, tau): U, S, V = np.linalg.svd(M, full_matrices=False) return np.dot(U, np.dot(np.diag(self.shrink(S, tau)), V)) def fit(self, tol=None, max_iter=1000, iter_print=None): ite = 0 err = np.Inf Sk = self.S Yk = self.Y Lk = np.zeros(self.D.shape) if tol: _tol = tol else: _tol = 1E-7 * self.norm_p(np.abs(self.D), 2) while (err > _tol) and ite < max_iter: Lk = self.svd_threshold( self.D - Sk + self.mu_inv * Yk, self.mu_inv) Sk = self.shrink( self.D - Lk + (self.mu_inv * Yk), self.mu_inv * self.lmbda) Yk = Yk + self.mu * (self.D - Lk - Sk) err = self.norm_p(np.abs(self.D - Lk - Sk), 2) ite += 1 if iter_print is None: continue if (ite % iter_print) == 0 or ite == 1 or ite > max_iter or err <= _tol: print('iteration: {0}, error: {1}'.format(ite, err)) self.L = Lk self.S = Sk return Lk, Sk def plot_fit(self, size=None, tol=0.1, axis_on=True): import matplotlib.pyplot as plt n, d = self.D.shape if size: nrows, ncols = size else: sq = np.ceil(np.sqrt(n)) nrows = int(sq) ncols = int(sq) ymin = np.nanmin(self.D) ymax = np.nanmax(self.D) print('ymin: {0}, ymax: {1}'.format(ymin, ymax)) numplots = np.min([n, nrows * ncols]) plt.figure() for n in range(numplots): plt.subplot(nrows, ncols, n + 1) plt.ylim((ymin - tol, ymax + tol)) plt.plot(self.L[n, :] + self.S[n, :], 'r') plt.plot(self.L[n, :], 'b') if not axis_on: plt.axis('off')