Source code for singlet.dataset.correlations

# vim: fdm=indent
# author:     Fabio Zanini
# date:       16/08/17
# content:    Dataset functions to correlate gene expression and phenotypes
# Modules
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from .plugins import Plugin


# Classes / functions
[docs]class Correlation(Plugin):
    '''Correlate gene expression and phenotype in single cells'''

    @staticmethod
    def _correlate(x, y, method):
        if method == 'pearson':
            xw = x
            yw = y
        elif method == 'spearman':
            xw = np.zeros_like(x, float)
            for ii, xi in enumerate(x):
                xw[ii] = rankdata(xi, method='average')
            yw = np.zeros_like(y, float)
            for ii, yi in enumerate(y):
                yw[ii] = rankdata(yi, method='average')
        else:
            raise ValueError('correlation method not understood')

        xw = ((xw.T - xw.mean(axis=1)) / xw.std(axis=1)).T
        yw = ((yw.T - yw.mean(axis=1)) / yw.std(axis=1)).T
        n = xw.shape[1]
        r = np.dot(xw, yw.T) / n
        return r

[docs]    def correlate_samples(
            self,
            samples='all',
            samples2=None,
            phenotypes=None,
            method='spearman'):
        '''Correlate feature expression with one or more phenotypes.

        Args:
            samples (list or string): list of samples to correlate. Use a
                string for a single sample. The special string 'all'
                (default) uses all samples.
            samples2 (list or string): list of samples to correlate with.
                Use a string for a single sample. The special string
                'all' uses all samples. None (default) takes the same
                list as samples, returning a square matrix.
            method (string): type of correlation. Must be one of 'pearson' or
                'spearman'.
            phenotypes (list): phenotypes to include as additional features in
                the correlation calculation. None (default) means only feature
                counts are used.

        Returns:
            pandas.DataFrame with the correlation coefficients. If either
                samples or samples2 is a single string, the function
                returns a pandas.Series. If both are a string, it returns
                a single correlation coefficient.
        '''
        exp_all = self.dataset.counts.T

        if phenotypes is not None:
            for pheno in phenotypes:
                exp_all[pheno] = self.dataset.samplesheet[pheno].astype(float)

        if not isinstance(samples, str):
            exp = exp_all.loc[samples]
        elif samples == 'all':
            samples = exp_all.index
            exp = exp_all
        else:
            exp = exp_all.loc[[samples]]

        if samples2 is None:
            samples2 = samples
            exp2 = exp
        elif not isinstance(samples2, str):
            exp2 = exp_all.loc[samples2]
        elif samples2 == 'all':
            samples2 = exp_all.index
            exp2 = exp_all
        else:
            exp2 = exp_all.loc[[samples2]]

        x = exp.values
        y = exp2.values

        r = self._correlate(x, y, method=method)

        if (not isinstance(samples, str)) and (not isinstance(samples2, str)):
            return pd.DataFrame(
                    data=r,
                    index=exp.index,
                    columns=exp2.index,
                    dtype=float)
        elif isinstance(samples, str) and (not isinstance(samples2, str)):
            return pd.Series(
                    data=r[0],
                    index=exp2.index,
                    name='correlation',
                    dtype=float)
        elif (not isinstance(samples, str)) and isinstance(samples2, str):
            return pd.Series(
                    data=r[:, 0],
                    index=exp.index,
                    name='correlation',
                    dtype=float)
        else:
            return r[0, 0]

[docs]    def correlate_features_phenotypes(
            self,
            phenotypes,
            features='all',
            method='spearman',
            fillna=None,
            ):
        '''Correlate feature expression with one or more phenotypes.

        Args:
            phenotypes (list of string): list of phenotypes, i.e. columns of
                the samplesheet. Use a string for a single phenotype.
            features (list or string): list of features to correlate. Use a
                string for a single feature. The special string 'all'
                (default) uses all features.
            method (string): type of correlation. Must be one of 'pearson' or
                'spearman'.
            fillna (dict, int, or None): a dictionary with phenotypes as keys
                and numbers to fill for NaNs as values. None will do
                nothing.

        Returns:
            pandas.DataFrame with the correlation coefficients. If either
                phenotypes or features is a single string, the function
                returns a pandas.Series. If both are a string, it returns
                a single correlation coefficient.
        '''

        exp = self.dataset.counts
        if features != 'all':
            if isinstance(features, str):
                exp = exp.loc[[features]]
            else:
                exp = exp.loc[features]
        else:
            features = exp.index.tolist()

        phe = self.dataset.samplesheet
        if isinstance(phenotypes, str):
            phe = phe.loc[:, [phenotypes]]
        else:
            phe = phe.loc[:, phenotypes]

        if fillna is not None:
            phe = phe.copy()
            if np.isscalar(fillna):
                phe.fillna(fillna, inplace=True)
            else:
                for key, fna in fillna.items():
                    phe.loc[:, key].fillna(fna, inplace=True)

        x = exp.values
        y = phe.values.T

        r = self._correlate(x, y, method=method)

        if (not isinstance(features, str)) and (not isinstance(phenotypes, str)):
            return pd.DataFrame(
                    data=r,
                    index=exp.index,
                    columns=phe.columns,
                    dtype=float)
        elif isinstance(features, str) and (not isinstance(phenotypes, str)):
            return pd.Series(
                    data=r[0],
                    index=phe.columns,
                    name='correlation',
                    dtype=float)
        elif (not isinstance(features, str)) and isinstance(phenotypes, str):
            return pd.Series(
                    data=r[:, 0],
                    index=exp.index,
                    name='correlation',
                    dtype=float)
        else:
            return r[0, 0]

[docs]    def correlate_features_features(
            self,
            features='all',
            features2=None,
            method='spearman'):
        '''Correlate feature expression with one or more phenotypes.

        Args:
            features (list or string): list of features to correlate. Use a
                string for a single feature. The special string 'all'
                (default) uses all features.
            features2 (list or string): list of features to correlate with.
                Use a string for a single feature. The special string
                'all' uses all features. None (default) takes the same
                list as features, returning a square matrix.
            method (string): type of correlation. Must be one of 'pearson' or
                'spearman'.

        Returns:
            pandas.DataFrame with the correlation coefficients. If either
                features or features2 is a single string, the function
                returns a pandas.Series. If both are a string, it returns
                a single correlation coefficient.
        '''
        exp_all = self.dataset.counts
        if not isinstance(features, str):
            exp = exp_all.loc[features]
        elif features == 'all':
            features = exp_all.index
            exp = exp_all
        else:
            exp = exp_all.loc[[features]]

        if features2 is None:
            features = exp.index
            exp2 = exp
        elif not isinstance(features2, str):
            exp2 = exp_all.loc[features2]
        elif features2 == 'all':
            features = exp_all.index
            exp2 = exp_all
        else:
            exp2 = exp_all.loc[[features2]]

        x = exp.values
        y = exp2.values

        r = self._correlate(x, y, method=method)

        if (not isinstance(features, str)) and (not isinstance(features2, str)):
            return pd.DataFrame(
                    data=r,
                    index=exp.index,
                    columns=exp2.index,
                    dtype=float)
        elif isinstance(features, str) and (not isinstance(features2, str)):
            return pd.Series(
                    data=r[0],
                    index=exp2.index,
                    name='correlation',
                    dtype=float)
        elif (not isinstance(features, str)) and isinstance(features2, str):
            return pd.Series(
                    data=r[:, 0],
                    index=exp.index,
                    name='correlation',
                    dtype=float)
        else:
            return r[0, 0]

[docs]    def correlate_phenotypes_phenotypes(
            self,
            phenotypes,
            phenotypes2=None,
            method='spearman',
            fillna=None,
            fillna2=None,
            ):
        '''Correlate feature expression with one or more phenotypes.

        Args:
            phenotypes (list of string): list of phenotypes, i.e. columns of
                the samplesheet. Use a string for a single phenotype.
            phenotypes2 (list of string): list of phenotypes, i.e. columns of
                the samplesheet. Use a string for a single phenotype.
                None (default) uses the same as phenotypes.
            method (string): type of correlation. Must be one of 'pearson' or
                'spearman'.
            fillna (dict, int, or None): a dictionary with phenotypes as keys
                and numbers to fill for NaNs as values. None will do
                nothing, potentially yielding NaN as correlation
                coefficients.
            fillna2 (dict, int, or None): as fillna, but for phenotypes2.

        Returns:
            pandas.DataFrame with the correlation coefficients. If either
                phenotypes or features is a single string, the function
                returns a pandas.Series. If both are a string, it returns
                a single correlation coefficient.
        '''

        phe_all = self.dataset.samplesheet
        if isinstance(phenotypes, str):
            phe = phe_all.loc[:, [phenotypes]]
        else:
            phe = phe_all.loc[:, phenotypes]

        if phenotypes2 is None:
            phe2 = phe.copy()
        elif isinstance(phenotypes2, str):
            phe2 = phe_all.loc[:, [phenotypes2]]
        else:
            phe2 = phe_all.loc[:, phenotypes2]

        if fillna is not None:
            phe = phe.copy()
            if np.isscalar(fillna):
                phe.fillna(fillna, inplace=True)
            else:
                for key, fna in fillna.items():
                    phe.loc[:, key].fillna(fna, inplace=True)

        if fillna2 is not None:
            phe2 = phe2.copy()
            if np.isscalar(fillna2):
                phe2.fillna(fillna2, inplace=True)
            else:
                for key, fna in fillna2.items():
                    phe2.loc[:, key].fillna(fna, inplace=True)

        x = phe.values.T
        y = phe2.values.T

        r = self._correlate(x, y, method=method)

        if (not isinstance(phenotypes, str)) and (not isinstance(phenotypes2, str)):
            return pd.DataFrame(
                    data=r,
                    index=phe.columns,
                    columns=phe2.columns,
                    dtype=float)
        elif isinstance(phenotypes, str) and (not isinstance(phenotypes2, str)):
            return pd.Series(
                    data=r[0],
                    index=phe2.columns,
                    name='correlation',
                    dtype=float)
        elif (not isinstance(phenotypes, str)) and isinstance(phenotypes2, str):
            return pd.Series(
                    data=r[:, 0],
                    index=phe.columns,
                    name='correlation',
                    dtype=float)
        else:
            return r[0, 0]

#    def mutual_information(
#            self,
#            xs,
#            ys):
#        '''Mutual information between feature counts and/or phenotypes
#
#        Args:
#            xs (list or string): Features and/or phenotypes to use as
#                abscissa (independent variable). The string
#                'total' means all features including spikeins and other,
#                'mapped' means all features excluding spikeins and other,
#                'spikeins' means only spikeins, and 'other' means only
#                'other' features.
#            ys (list or string): Features and/or phenotypes to use as
#                ordinate (dependent variable). The string
#                'total' means all features including spikeins and other,
#                'mapped' means all features excluding spikeins and other,
#                'spikeins' means only spikeins, and 'other' means only
#                'other' features.
#
#        NOTE: Mutual information is defined only for discrete or categorical
#            variables and require a decent coverage of all bins or
#            categories because it has p(x)p(y) in the denominator.
#            Feature counts and quantitative phenotypes require binning
#            prior to calculating Mutual information. See CountsTable.bin
#            and SampleSheet.bin for options. This function uses
#            all unique values in the counts and phenotyes as separate
#            bins.
#        '''
#        datad = {'x': {'names': xs, 'data': []},
#                 'y': {'names': ys, 'data': []}}
#
#        counts = self.dataset.counts
#        sheet = self.dataset.samplesheet.T
#
#        for key, dic in datad.items():
#            args = dic['names']
#            if args == 'total':
#                counts_k = counts
#            elif args == 'mapped':
#                counts_k = counts.exclude_features(
#                        spikeins=True, other=True)
#            elif args == 'spikeins':
#                counts_k = counts.get_spikeins()
#            elif args == 'other':
#                counts_k = counts.get_other_features()
#            else:
#                counts_k = counts.loc[counts.index.isin(args)]
#
#            pheno_k = sheet.loc[sheet.index.isin(args)]
#
#            if len(counts_k):
#                dic['data'].append(counts_k)
#            if len(pheno_k):
#                dic['data'].append(pheno_k)
#
#            if len(dic['data']) == 0:
#                raise ValueError('Data for {:} are empty!'.format(key))
#            elif len(dic['data']) == 1:
#                dic['data'] = dic['data'][0]
#            else:
#                dic['data'] = pd.concat(dic['data'], axis=0)
#
#        # Prepare output structure
#        n_x = datad['x']['data'].shape[0]
#        n_y = datad['y']['data'].shape[0]
#        coords_x = datad['x']['data'].index
#        coords_y = datad['y']['data'].index
#        res = pd.DataFrame(
#                data=np.zeros((n_x, n_y)),
#                index=coords_x,
#                columns=coords_y)
#
#        for key_x, data_x in datad['x']['data'].iterrows():
#            for key_y, data_y in datad['y']['data'].iterrows():
#                px = data_x.value_counts(normalize=True, sort=False)
#                py = data_y.value_counts(normalize=True, sort=False)
#
#                data_xy = pd.concat([data_x, data_y], axis=0)
#
#                # FIXME: finish this
#                import ipdb; ipdb.set_trace() 
#
#
singlet

Navigation

Related Topics

Source code for singlet.dataset.correlations