Source code for singlet.dataset.feature_selection

# vim: fdm=indent
# author:     Fabio Zanini
# date:       16/08/17
# content:    Dataset functions for feature selection
# Modules
import numpy as np
import pandas as pd
import xarray as xr


# Classes / functions
class FeatureSelection():
    '''Select features of a dataset by uniqueness, expression or dispersion'''

    def __init__(self, dataset):
        '''Select features

        Args:
            dataset (Dataset): the dataset to analyze.
        '''
        self.dataset = dataset

    def unique(
            self,
            inplace=False):
        '''Select features with unique ids

        Args:
            inplace (bool): Whether to change the feature list in place.

        Returns:
            pd.Index of selected features if not inplace, else None.
        '''
        from collections import Counter

        featuresheet_index = self.dataset._featuresheet.index
        occurrences = Counter(featuresheet_index)
        # Keep only the ids that appear exactly once in the featuresheet
        features = [fea for fea, count in occurrences.items() if count == 1]

        if inplace:
            self.dataset.counts = self.dataset._counts.loc[features]
            return None
        return pd.Index(features, name=featuresheet_index.name)

    def expressed(
            self,
            n_samples,
            exp_min,
            inplace=False):
        '''Select features that are expressed in at least some samples.

        Args:
            n_samples (int): Minimum number of samples the features should
                be expressed in.
            exp_min (float): Minimum level of expression of the features.
            inplace (bool): Whether to change the feature list in place.

        Returns:
            pd.Index of selected features if not inplace, else None.
        '''
        counts = self.dataset.counts
        # For each row of the counts table, count the entries at or above
        # the expression threshold
        n_expressing = (counts >= exp_min).sum(axis=1)
        ind = n_expressing >= n_samples

        if inplace:
            self.dataset.counts = counts.loc[ind]
            return None
        return self.dataset.featurenames[ind]

    def overdispersed_strata(
            self,
            bins=10,
            n_features_per_stratum=50,
            inplace=False):
        '''Select overdispersed features in strata of increasing expression.

        Args:
            bins (int or list): Bin edges determining the strata. If this is
                a number, split the expression in this many equally spaced
                bins between minimal and maximal expression.
            n_features_per_stratum (int): Number of features per stratum to
                select.
            inplace (bool): Whether to change the feature list in place.

        Returns:
            pd.Index of selected features if not inplace, else None.

        Raises:
            ValueError: if bins does not define at least one stratum, i.e.
                it has fewer than two edges.

        Notice that the number of selected features may be smaller than
        expected if some strata have no dispersion (e.g. only dropouts).
        Because of this, it is recommended you restrict the counts to
        expressed features before using this function.
        '''
        stats = self.dataset.counts.get_statistics(metrics=('mean', 'cv'))
        mean = stats.loc[:, 'mean']

        if np.isscalar(bins):
            exp_min, exp_max = mean.values.min(), mean.values.max()
            bins = np.linspace(exp_min, exp_max, bins + 1)

        n_strata = len(bins) - 1
        if n_strata < 1:
            # Fail early with a clear message instead of numpy's
            # "need at least one array to concatenate"
            raise ValueError(
                'bins must define at least one stratum (two or more edges)')

        features = []
        for i, (left, right) in enumerate(zip(bins[:-1], bins[1:])):
            in_stratum = mean >= left
            # The last stratum has no upper bound, so features sitting on
            # the last edge are not dropped by the strict "<" comparison
            if i != n_strata - 1:
                in_stratum = in_stratum & (mean < right)
            cvi = stats.loc[in_stratum, 'cv']
            features.append(cvi.nlargest(n_features_per_stratum).index)
        features = pd.Index(np.concatenate(features), name=stats.index.name)

        if inplace:
            self.dataset.counts = self.dataset.counts.loc[features]
            return None
        return features

    def sam(self, k=None, distance='correlation', *args, **kwargs):
        '''Calculate feature weights via self-assembling manifolds

        Args:
            k (int or None): The number of nearest neighbors for each sample
            distance (str): The distance argument passed to the SAM
                constructor
            *args, **kwargs: Arguments to SAM.run

        Returns:
            SAM instance containing SAM.output_vars['gene_weights']

        See also: https://github.com/atarashansky/self-assembling-manifold
        '''
        # Imported here so the module can be used without SAM installed
        import SAM

        # SAM receives the transposed counts table
        model = SAM.SAM(
            counts=self.dataset.counts.T,
            k=k,
            distance=distance)
        model.run(*args, **kwargs)
        return model