Source code for singlet.dataset.feature_selection
# vim: fdm=indent
# author: Fabio Zanini
# date: 16/08/17
# content: Dataset functions for feature selection
# Modules
import numpy as np
import pandas as pd
import xarray as xr
# Classes / functions
class FeatureSelection():
    '''Select features of a dataset based on their counts.

    Every selector reads the counts table from ``self.dataset.counts`` and
    either returns the names of the selected features or, when called with
    ``inplace=True``, replaces ``self.dataset.counts`` with the rows of the
    selected features.
    '''

    def __init__(self, dataset):
        '''Select features

        Args:
            dataset (Dataset): the dataset to analyze.
        '''
        self.dataset = dataset

    def expressed(
            self,
            n_samples,
            exp_min,
            inplace=False):
        '''Select features that are expressed in at least some samples.

        Args:
            n_samples (int): Minimum number of samples the features should be
                expressed in.
            exp_min (float): Minimum level of expression of the features.
            inplace (bool): Whether to change the feature list in place.

        Returns:
            pd.Index of selected features if not inplace, else None.
        '''
        counts = self.dataset.counts
        # Per feature (row), count the samples whose value reaches exp_min
        # and keep the features with enough such samples
        ind = (counts >= exp_min).sum(axis=1) >= n_samples
        if inplace:
            self.dataset.counts = counts.loc[ind]
            return None
        return self.dataset.featurenames[ind]

    def overdispersed_strata(
            self, bins=10,
            n_features_per_stratum=50,
            inplace=False):
        '''Select overdispersed features in strata of increasing expression.

        Features are grouped into strata by their mean expression, and within
        each stratum the features with the largest coefficient of variation
        are selected.

        Args:
            bins (int or list): Bin edges determining the strata. If this is
                a number, use that number of quantiles. Each stratum includes
                its lower edge and excludes its upper edge, except the last
                one, which contains every feature at or above its lower edge.
            n_features_per_stratum (int): Number of features per stratum to
                select.
            inplace (bool): Whether to restrict the counts table to the
                selected features in place.

        Returns:
            pd.Index of selected features if not inplace, else None.

        Raises:
            ValueError: if bins does not define at least one stratum, i.e.
                fewer than two bin edges or a number of quantiles below one.

        Notice that the number of selected features may be smaller than
        expected if some strata have no dispersion (e.g. only dropouts).
        Because of this, it is recommended you restrict the counts to
        expressed features before using this function.
        '''
        stats = self.dataset.counts.get_statistics(metrics=('mean', 'cv'))
        mean = stats.loc[:, 'mean']

        if np.isscalar(bins):
            # bins quantiles are delimited by bins + 1 edges
            bins = mean.quantile(np.linspace(0, 1, bins + 1)).values

        n_strata = len(bins) - 1
        if n_strata < 1:
            # Fail early with an explicit message instead of letting
            # np.concatenate complain about an empty list further down
            raise ValueError(
                'bins must define at least one stratum: pass two or more '
                'bin edges or a positive number of quantiles')

        features = []
        for i, (lower, upper) in enumerate(zip(bins[:-1], bins[1:])):
            in_stratum = mean >= lower
            if i != n_strata - 1:
                # Only the last stratum is left without an upper bound, so
                # the features with the highest mean are not dropped
                in_stratum = in_stratum & (mean < upper)
            cvi = stats.loc[in_stratum, 'cv']
            features.append(cvi.nlargest(n_features_per_stratum).index)

        # The index name comes from the statistics table itself rather than
        # from a variable leaked out of the loop
        features = pd.Index(np.concatenate(features), name=stats.index.name)

        if inplace:
            self.dataset.counts = self.dataset.counts.loc[features]
            return None
        return features