Source code for singlet.dataset.feature_selection

# vim: fdm=indent
# author:     Fabio Zanini
# date:       16/08/17
# content:    Dataset functions for feature selection
# Modules
import numpy as np
import pandas as pd
import xarray as xr


# Classes / functions
class FeatureSelection():
    '''Select features of a dataset based on their counts.

    The selectors read ``dataset.counts`` and either return the names of the
    selected features or, with ``inplace=True``, restrict ``dataset.counts``
    to the selected features.
    '''
    def __init__(self, dataset):
        '''Select features

        Args:
            dataset (Dataset): the dataset to analyze.
        '''
        self.dataset = dataset

    def expressed(
            self,
            n_samples,
            exp_min,
            inplace=False):
        '''Select features that are expressed in at least some samples.

        Args:
            n_samples (int): Minimum number of samples the features should be
                    expressed in.
            exp_min (float): Minimum level of expression of the features.
            inplace (bool): Whether to change the feature list in place.

        Returns:
            pd.Index of selected features if not inplace, else None.
        '''
        # For every row (feature), count the columns (samples) whose value is
        # at least exp_min, and keep the rows reaching n_samples.
        ind = (self.dataset.counts >= exp_min).sum(axis=1) >= n_samples
        if inplace:
            self.dataset.counts = self.dataset.counts.loc[ind]
            return None
        return self.dataset.featurenames[ind]

    def overdispersed_strata(
            self, bins=10,
            n_features_per_stratum=50,
            inplace=False):
        '''Select overdispersed features in strata of increasing expression.

        Features are grouped into strata by their 'mean' statistic and, within
        each stratum, the features with the largest 'cv' statistic are kept.

        Args:
            bins (int or list): Bin edges determining the strata. If this is
                    a number, use that number of quantiles. Each stratum
                    includes its lower edge and excludes its upper edge,
                    except the last one which has no upper bound (so the
                    feature with the highest mean is always included).
                    Features with a mean below the first edge are never
                    selected.
            n_features_per_stratum (int): Number of features per stratum to
                    select.
            inplace (bool): Whether to change the feature list in place.

        Returns:
            pd.Index of selected features if not inplace, else None.

        Raises:
            ValueError: if bins defines fewer than two edges, i.e. no stratum.

        Notice that the number of selected features may be smaller than
        expected if some strata have no dispersion (e.g. only dropouts).
        Because of this, it is recommended you restrict the counts to
        expressed features before using this function.
        '''

        stats = self.dataset.counts.get_statistics(metrics=('mean', 'cv'))
        mean = stats.loc[:, 'mean']

        if np.isscalar(bins):
            bins = mean.quantile(np.linspace(0, 1, bins+1)).values

        if len(bins) < 2:
            # Without this check np.concatenate below fails with an opaque
            # "need at least one array to concatenate" ValueError.
            raise ValueError(
                'bins must define at least one stratum (two or more edges)')

        n_strata = len(bins) - 1
        features = []
        for i, (lower, upper) in enumerate(zip(bins[:-1], bins[1:])):
            in_stratum = mean >= lower
            if i < n_strata - 1:
                # Only inner strata are bounded above; the last one is left
                # open so that the maximum is not dropped.
                in_stratum = in_stratum & (mean < upper)
            cvi = stats.loc[in_stratum, 'cv']
            features.append(cvi.nlargest(n_features_per_stratum).index)
        features = pd.Index(np.concatenate(features), name=stats.index.name)

        if inplace:
            self.dataset.counts = self.dataset.counts.loc[features]
            return None
        return features
singlet

Navigation

Related Topics

Source code for singlet.dataset.feature_selection