Source code for singlet.dataset

# vim: fdm=indent
# author:     Fabio Zanini
# date:       14/08/17
# content:    Dataset that combines feature counts with metadata.
# Modules
import numpy as np
import pandas as pd

from ..samplesheet import SampleSheet
from ..counts_table import CountsTable
from ..featuresheet import FeatureSheet


# Classes / functions
[docs]class Dataset():
    '''Collection of cells, with feature counts and metadata'''

    def __init__(self, counts_table=None, samplesheet=None, featuresheet=None):
        '''Collection of cells, with feature counts and metadata

        Args:
            counts_table (string): Name of the counts table (to load from a \
                    config file) or instance of CountsTable
            samplesheet (string or None): Name of the samplesheet (to load \
                    from a config file) or instance of SampleSheet
            featuresheet (string or None): Name of the samplesheet (to load \
                    from a config file) or instance of FeatureSheet

        NOTE: All samples in the counts_table must also be in the \
                samplesheet, but the latter can have additional samples. If \
                that is the case, the samplesheet is sliced down to the \
                samples present in the counts_table.
        '''
        from .correlations import Correlation
        from .plot import Plot
        from .dimensionality import DimensionalityReduction
        from .cluster import Cluster
        from .fit import Fit
        from .feature_selection import FeatureSelection

        # In general this class should be used for gene counts and phenotypes,
        # but we have to cover the corner cases that no counts or no phenotypes
        # are provided
        if counts_table is None:
            if samplesheet is None:
                raise ValueError('At least samplesheet or counts_table must be present')
            elif not isinstance(samplesheet, SampleSheet):
                self._samplesheet = SampleSheet.from_sheetname(samplesheet)
            else:
                self._samplesheet = samplesheet
            self._counts = CountsTable(data=[], index=[], columns=self._samplesheet.index)
        else:
            if not isinstance(counts_table, CountsTable):
                counts_table = CountsTable.from_tablename(counts_table)
            self._counts = counts_table

            if samplesheet is None:
                self._samplesheet = SampleSheet(data=[], index=self._counts.columns)
                self._samplesheet.sheetname = None
            elif not isinstance(samplesheet, SampleSheet):
                self._samplesheet = SampleSheet.from_sheetname(samplesheet)
            else:
                self._samplesheet = samplesheet

        if featuresheet is None:
            self._featuresheet = FeatureSheet(data=[], index=self._counts.index)
            self._featuresheet.sheetname = None
        elif not isinstance(featuresheet, FeatureSheet):
            self._featuresheet = FeatureSheet.from_sheetname(featuresheet)
        else:
            self._featuresheet = featuresheet

        # Uniform axes across data and metadata
        assert(self._counts.columns.isin(self._samplesheet.index).all())
        self._samplesheet = self._samplesheet.loc[self._counts.columns]
        # FIXME: this is very slow
        #assert(self._counts.index.isin(self._featuresheet.index).all())
        #self._featuresheet = self._featuresheet.loc[self._counts.index]

        # Plugins
        self.correlation = Correlation(self)
        self.plot = Plot(self)
        self.dimensionality = DimensionalityReduction(self)
        self.cluster = Cluster(self)
        self.fit = Fit(self)
        self.feature_selection = FeatureSelection(self)

    def __str__(self):
        return '{:} with {:} samples and {:} features'.format(
                self.__class__.__name__,
                self.n_samples,
                self.n_features)

    def __repr__(self):
        if not hasattr(self._counts, 'name'):
            ctn = 'None'
        else:
            ctn = '"{:}"'.format(self._counts.name)

        if self._samplesheet.sheetname is None:
            ssn = 'None'
        else:
            ssn = '"{:}"'.format(self._samplesheet.sheetname)

        if self._featuresheet.sheetname is None:
            fsn = 'None'
        else:
            fsn = '"{:}"'.format(self._featuresheet.sheetname)

        return '{:}({:}, {:}, {:})'.format(
                self.__class__.__name__,
                ctn,
                ssn,
                fsn,
                )

    def __eq__(self, other):
        if type(other) is not type(self):
            return False
        # FIXME: fillna(0) is sloppy but not so bad
        ss = (self._samplesheet.fillna(0) == other._samplesheet.fillna(0)).values.all()
        fs = (self._featuresheet.fillna(0) == other._featuresheet.fillna(0)).values.all()
        ct = (self._counts == other._counts).values.all()
        return ss and fs and ct

    def __ne__(self, other):
        return not self == other

    def __add__(self, other):
        '''Merge two Datasets.

        For samples with the same names, counts will be added and metadata of \
                one of the Datasets used. For new samples, the new counts and \
                metadata will be used.

        NOTE: metadata and gene names must be aligned for this operation to \
                succeed. If one of the two Datasets has more metadata or \
                features than the other, they cannot be added.
        '''
        selfcopy = self.copy()
        selfcopy += other
        return selfcopy

    def __iadd__(self, other):
        '''Merge two Datasets.

        For samples with the same names, counts will be added and metadata of \
                one of the Datasets used. For new samples, the new counts and \
                metadata will be used.

        NOTE: metadata and gene names must be aligned for this operation to \
                succeed. If one of the two Datasets has more metadata or \
                features than the other, they cannot be added.
        '''
        if set(self.metadatanames) != set(other.metadatanames):
            raise IndexError('The Datasets have different metadata')
        if set(self.featurenames) != set(other.featurenames):
            raise IndexError('The Datasets have different features')

        snames = self.samplenames
        for samplename, meta in other.samplesheet.iterrows():
            if samplename not in snames:
                self.samplesheet.loc[samplename] = meta
                self.counts.loc[:, samplename] = other.counts.loc[:, samplename]
            else:
                self.counts.loc[:, samplename] += other.counts.loc[:, samplename]

[docs]    def split(self, phenotypes, copy=True):
        '''Split Dataset based on one or more categorical phenotypes

        Args:
            phenotypes (string or list of strings): one or more phenotypes to \
                    use for the split. Unique values of combinations of these \
                    determine the split Datasets.

        Returns:
            dict of Datasets: the keys are either unique values of the \
                    phenotype chosen or, if more than one, tuples of unique \
                    combinations.
        '''
        from itertools import product

        if isinstance(phenotypes, str):
            phenotypes = [phenotypes]

        phenos_uniques = [tuple(set(self.samplesheet.loc[:, p])) for p in phenotypes]
        dss = {}
        for comb in product(*phenos_uniques):
            ind = np.ones(self.n_samples, bool)
            for p, val in zip(phenotypes, comb):
                ind &= self.samplesheet.loc[:, p] == val
            if ind.sum():
                samplesheet = self.samplesheet.loc[ind]
                counts = self.counts.loc[:, ind]

                if copy:
                    samplesheet = samplesheet.copy()
                    counts = counts.copy()

                if len(phenotypes) == 1:
                    label = comb[0]
                else:
                    label = comb

                dss[label] = self.__class__(
                        samplesheet=samplesheet,
                        counts_table=counts,
                        featuresheet=self.featuresheet,
                        )
        return dss

    @property
    def n_samples(self):
        '''Number of samples'''
        return self._samplesheet.shape[0]

    @property
    def n_features(self):
        '''Number of features'''
        return self._counts.shape[0]

    @property
    def samplenames(self):
        '''pandas.Index of sample names'''
        return self._samplesheet.index.copy()

    @property
    def featurenames(self):
        '''pandas.Index of feature names'''
        return self._counts.index.copy()

    @property
    def samplemetadatanames(self):
        '''pandas.Index of sample metadata column names'''
        return self._samplesheet.columns.copy()

    @property
    def featuremetadatanames(self):
        '''pandas.Index of feature metadata column names'''
        return self._featuresheet.columns.copy()

    @property
    def samplesheet(self):
        '''Matrix of sample metadata.

        Rows are samples, columns are metadata (e.g. phenotypes).
        '''
        return self._samplesheet

    @samplesheet.setter
    def samplesheet(self, value):
        self._counts = self._counts.loc[:, value.index]
        self._samplesheet = value

    @property
    def counts(self):
        '''Matrix of gene expression counts.

        Rows are features, columns are samples.

        Notice: If you reset this matrix with features that are not in the \
                featuresheet or samples that are not in the samplesheet, \
                those tables will be reset to empty.
        '''
        return self._counts

    @counts.setter
    def counts(self, value):
        try:
            self._samplesheet = self._samplesheet.loc[value.columns]
        except KeyError:
            self._samplesheet = SampleSheet(data=[], index=value.columns)

        try:
            self._featuresheet = self._featuresheet.loc[value.index]
        except KeyError:
            self._featuresheet = FeatureSheet(data=[], index=value.index)

        self._counts = value

    @property
    def featuresheet(self):
        '''Matrix of feature metadata.

        Rows are features, columns are metadata (e.g. Gene Ontologies).
        '''
        return self._featuresheet

    @featuresheet.setter
    def featuresheet(self, value):
        self._counts = self._counts.loc[value.index, :]
        self._featuresheet = value

[docs]    def copy(self):
        '''Copy of the Dataset including a new SampleSheet and CountsTable'''
        return self.__class__(
                counts_table=self._counts.copy(),
                samplesheet=self._samplesheet.copy(),
                featuresheet=self.featuresheet.copy())

[docs]    def query_samples_by_metadata(
            self,
            expression,
            inplace=False,
            local_dict=None):
        '''Select samples based on metadata.

        Args:
            expression (string): An expression compatible with pandas.DataFrame.query.
            inplace (bool): Whether to change the Dataset in place or return a new one.
            local_dict (dict): A dictionary of local variables, useful if you \
                    are using @var assignments in your expression. By far the \
                    most common usage of this argument is to set \
                    local_dict=locals().

        Returns:
            If `inplace` is True, None. Else, a Dataset.
        '''
        if inplace:
            self._samplesheet.query(
                    expression, inplace=True,
                    local_dict=local_dict)
            self._counts = self._counts.loc[:, self._samplesheet.index]
        else:
            samplesheet = self._samplesheet.query(
                    expression, inplace=False,
                    local_dict=local_dict)
            counts_table = self._counts.loc[:, samplesheet.index].copy()
            return self.__class__(
                    samplesheet=samplesheet,
                    counts_table=counts_table,
                    featuresheet=self._featuresheet.copy(),
                    )

[docs]    def query_features_by_metadata(
            self,
            expression,
            inplace=False,
            local_dict=None):
        '''Select features based on metadata.

        Args:
            expression (string): An expression compatible with pandas.DataFrame.query.
            inplace (bool): Whether to change the Dataset in place or return a new one.
            local_dict (dict): A dictionary of local variables, useful if you \
                    are using @var assignments in your expression. By far the \
                    most common usage of this argument is to set \
                    local_dict=locals().
        Returns:
            If `inplace` is True, None. Else, a Dataset.
        '''
        if inplace:
            self._featuresheet.query(
                    expression, inplace=True,
                    local_dict=local_dict)
            self._counts = self._counts.loc[self._featuresheet.index]
        else:
            featuresheet = self._featuresheet.query(
                    expression, inplace=False,
                    local_dict=local_dict)
            counts_table = self._counts.loc[featuresheet.index].copy()
            samplesheet = self._samplesheet.copy()
            return self.__class__(
                    samplesheet=samplesheet,
                    counts_table=counts_table,
                    featuresheet=featuresheet)

[docs]    def query_samples_by_counts(
            self, expression, inplace=False,
            local_dict=None):
        '''Select samples based on gene expression.

        Args:
            expression (string): An expression compatible with pandas.DataFrame.query.
            inplace (bool): Whether to change the Dataset in place or return a new one.
            local_dict (dict): A dictionary of local variables, useful if you \
                    are using @var assignments in your expression. By far the \
                    most common usage of this argument is to set \
                    local_dict=locals().
        Returns:
            If `inplace` is True, None. Else, a Dataset.
        '''
        counts = self._counts.copy()
        drop = []
        if ('total' in expression) and ('total' not in counts.index):
            counts.loc['total'] = counts.sum(axis=0)
            drop.append('total')
        if ('mapped' in expression) and ('mapped' not in counts.index):
            counts.loc['mapped'] = counts.exclude_features(spikeins=True, other=True).sum(axis=0)
            drop.append('mapped')

        counts_table = counts.T.query(
                expression, inplace=False,
                local_dict=local_dict).T
        if drop:
            counts_table.drop(drop, axis=0, inplace=True)

        if inplace:
            self.counts = counts_table
        else:
            samplesheet = self._samplesheet.loc[counts_table.columns].copy()
            return self.__class__(
                    samplesheet=samplesheet,
                    counts_table=counts_table,
                    featuresheet=self.featuresheet.copy())

[docs]    def query_features_by_counts(
            self, expression, inplace=False,
            local_dict=None):
        '''Select features based on their expression.

        Args:
            expression (string): An expression compatible with pandas.DataFrame.query.
            inplace (bool): Whether to change the Dataset in place or return a new one.
            local_dict (dict): A dictionary of local variables, useful if you \
                    are using @var assignments in your expression. By far the \
                    most common usage of this argument is to set \
                    local_dict=locals().
        Returns:
            If `inplace` is True, None. Else, a Dataset.
        '''
        if inplace:
            self._counts.query(
                    expression, inplace=True,
                    local_dict=local_dict)
            self._featuresheet = self._featuresheet.loc[self._counts.index]
        else:
            counts_table = self._counts.query(
                    expression, inplace=False,
                    local_dict=local_dict)
            samplesheet = self._samplesheet.copy()
            featuresheet = self._featuresheet.loc[counts_table.index].copy()
            return self.__class__(
                    samplesheet=samplesheet,
                    counts_table=counts_table,
                    featuresheet=featuresheet)

[docs]    def compare(
            self,
            other,
            features='mapped',
            phenotypes=(),
            method='kolmogorov-smirnov'):
        '''Statistically compare with another Dataset.

        Args:
            other (Dataset): The Dataset to compare with.
            features (list, string, or None): Features to compare. The string \
                    'total' means all features including spikeins and other, \
                    'mapped' means all features excluding spikeins and other, \
                    'spikeins' means only spikeins, and 'other' means only \
                    'other' features. If empty list or None, do not compare \
                    features (useful for phenotypic comparison).
            phenotypes (list of strings): Phenotypes to compare.
            method (string or function): Statistical test to use for the \
                    comparison. If a string it must be one of \
                    'kolmogorov-smirnov' or 'mann-whitney'. If a function, it \
                    must accept two arrays as arguments (one for each \
                    dataset, running over the samples) and return a P-value \
                    for the comparison.
        Return:
            A pandas.DataFrame containing the P-values of the comparisons for \
                    all features and phenotypes.
        '''

        pvalues = []
        if features:
            counts = self.counts
            counts_other = other.counts
            if features == 'total':
                pass
            elif features == 'mapped':
                counts = counts.exclude_features(
                        spikeins=True, other=True, errors='ignore')
                counts_other = counts_other.exclude_features(
                        spikeins=True, other=True, errors='ignore')
            elif features == 'spikeins':
                counts = counts.get_spikeins()
                counts_other = counts_other.get_spikeins()
            elif features == 'other':
                counts = counts.get_other_features()
                counts_other = counts_other.get_other_features()
            else:
                counts = counts.loc[features]
                counts_other = counts_other.loc[features]

            if method == 'kolmogorov-smirnov':
                from scipy.stats import ks_2samp
                for (fea, co1), (_, co2) in zip(
                        counts.iterrows(),
                        counts_other.iterrows()):
                    pvalues.append([fea, ks_2samp(co1.values, co2.values)[1]])

            elif method == 'mann-whitney':
                from scipy.stats import mannwhitneyu
                for (fea, co1), (_, co2) in zip(
                        counts.iterrows(),
                        counts_other.iterrows()):
                    # Mann-Whitney U has issues with ties
                    is_degenerate = False
                    if ((len(np.unique(co1.values)) == 1) or
                       (len(np.unique(co2.values)) == 1)):
                        is_degenerate = True
                    elif ((len(co1) == len(co2)) and
                          (np.sort(co1.values) == np.sort(co2.values)).all()):
                        is_degenerate = True
                    if is_degenerate:
                        pvalues.append([fea, 1])
                        continue
                    pvalues.append([fea, mannwhitneyu(
                        co1.values, co2.values,
                        alternative='two-sided')[1]])
            else:
                for (fea, co1) in counts.iterrows():
                    co2 = counts_other.loc[fea]
                    pvalues.append([fea, method(co1.values, co2.values)])

        if phenotypes:
            pheno = self.samplesheet.loc[:, phenotypes].T
            pheno_other = other.samplesheet.loc[:, phenotypes].T

            if method == 'kolmogorov-smirnov':
                from scipy.stats import ks_2samp
                for phe, val1 in pheno.iterrows():
                    val2 = pheno_other.loc[phe]
                    pvalues.append([phe, ks_2samp(val1.values, val2.values)[1]])
            elif method == 'mann-whitney':
                from scipy.stats import mannwhitneyu
                for phe, val1 in pheno.iterrows():
                    val2 = pheno_other.loc[phe]
                    # Mann-Whitney U has issues with ties
                    is_degenerate = False
                    if ((len(np.unique(val1.values)) == 1) or
                       (len(np.unique(val2.values)) == 1)):
                        is_degenerate = True
                    elif ((len(val1) == len(val2)) and
                          (np.sort(val1.values) == np.sort(val2.values)).all()):
                        is_degenerate = True
                    if is_degenerate:
                        pvalues.append([phe, 1])
                        continue
                    pvalues.append([phe, mannwhitneyu(
                        val1.values, val2.values,
                        alternative='two-sided')[1]])
            else:
                for phe, val1 in pheno.iterrows():
                    val2 = pheno_other.loc[phe]
                    pvalues.append([phe, method(val1.values, val2.values)])

        df = pd.DataFrame(pvalues, columns=['name', 'P-value'])
        df.set_index('name', drop=True, inplace=True)
        return df
singlet

Navigation

Related Topics

Source code for singlet.dataset