Source code for singlet.dataset.correlations

# vim: fdm=indent
# author:     Fabio Zanini
# date:       16/08/17
# content:    Dataset functions to correlate gene expression and phenotypes
# Modules
import numpy as np
import pandas as pd
from scipy.stats import rankdata


# Classes / functions
[docs]class Correlation():
    '''Correlate gene expression and phenotype in single cells'''
    def __init__(self, dataset):
        '''Correlate gene expression and phenotype in single cells

        Args:
            dataset (Dataset): the dataset to analyze.
        '''
        self.dataset = dataset

    @staticmethod
    def _correlate(x, y, method):
        if method == 'pearson':
            xw = x
            yw = y
        elif method == 'spearman':
            xw = np.zeros_like(x, float)
            for ii, xi in enumerate(x):
                xw[ii] = rankdata(xi, method='average')
            yw = np.zeros_like(y, float)
            for ii, yi in enumerate(y):
                yw[ii] = rankdata(yi, method='average')
        else:
            raise ValueError('correlation method not understood')

        xw = ((xw.T - xw.mean(axis=1)) / xw.std(axis=1)).T
        yw = ((yw.T - yw.mean(axis=1)) / yw.std(axis=1)).T
        n = xw.shape[1]
        r = np.dot(xw, yw.T) / n
        return r

[docs]    def correlate_samples(
            self,
            samples='all',
            samples2=None,
            phenotypes=None,
            method='spearman'):
        '''Correlate feature expression with one or more phenotypes.

        Args:
            samples (list or string): list of samples to correlate. Use a
                string for a single sample. The special string 'all'
                (default) uses all samples.
            samples2 (list or string): list of samples to correlate with.
                Use a string for a single sample. The special string
                'all' uses all samples. None (default) takes the same
                list as samples, returning a square matrix.
            method (string): type of correlation. Must be one of 'pearson' or
                'spearman'.
            phenotypes (list): phenotypes to include as additional features in
                the correlation calculation. None (default) means only feature
                counts are used.

        Returns:
            pandas.DataFrame with the correlation coefficients. If either
                samples or samples2 is a single string, the function
                returns a pandas.Series. If both are a string, it returns
                a single correlation coefficient.
        '''
        exp_all = self.dataset.counts.T

        if phenotypes is not None:
            for pheno in phenotypes:
                exp_all[pheno] = self.dataset.samplesheet[pheno].astype(float)

        if not isinstance(samples, str):
            exp = exp_all.loc[samples]
        elif samples == 'all':
            samples = exp_all.index
            exp = exp_all
        else:
            exp = exp_all.loc[[samples]]

        if samples2 is None:
            samples = exp.index
            exp2 = exp
        elif not isinstance(samples2, str):
            exp2 = exp_all.loc[samples2]
        elif samples2 == 'all':
            samples = exp_all.index
            exp2 = exp_all
        else:
            exp2 = exp_all.loc[[samples2]]

        x = exp.values
        y = exp2.values

        r = self._correlate(x, y, method=method)

        if (not isinstance(samples, str)) and (not isinstance(samples2, str)):
            return pd.DataFrame(
                    data=r,
                    index=exp.index,
                    columns=exp2.index,
                    dtype=float)
        elif isinstance(samples, str) and (not isinstance(samples, str)):
            return pd.Series(
                    data=r[0],
                    index=exp2.index,
                    name='correlation',
                    dtype=float)
        elif (not isinstance(samples, str)) and isinstance(samples, str):
            return pd.Series(
                    data=r[:, 0],
                    index=exp.index,
                    name='correlation',
                    dtype=float)
        else:
            return r[0, 0]

[docs]    def correlate_features_phenotypes(
            self,
            phenotypes,
            features='all',
            method='spearman',
            fillna=None,
            ):
        '''Correlate feature expression with one or more phenotypes.

        Args:
            phenotypes (list of string): list of phenotypes, i.e. columns of
                the samplesheet. Use a string for a single phenotype.
            features (list or string): list of features to correlate. Use a
                string for a single feature. The special string 'all'
                (default) uses all features.
            method (string): type of correlation. Must be one of 'pearson' or
                'spearman'.
            fillna (dict, int, or None): a dictionary with phenotypes as keys
                and numbers to fill for NaNs as values. None will do
                nothing.

        Returns:
            pandas.DataFrame with the correlation coefficients. If either
                phenotypes or features is a single string, the function
                returns a pandas.Series. If both are a string, it returns
                a single correlation coefficient.
        '''

        exp = self.dataset.counts
        if features != 'all':
            if isinstance(features, str):
                exp = exp.loc[[features]]
            else:
                exp = exp.loc[features]
        else:
            features = exp.index.tolist()

        phe = self.dataset.samplesheet
        if isinstance(phenotypes, str):
            phe = phe.loc[:, [phenotypes]]
        else:
            phe = phe.loc[:, phenotypes]

        if fillna is not None:
            phe = phe.copy()
            if np.isscalar(fillna):
                phe.fillna(fillna, inplace=True)
            else:
                for key, fna in fillna.items():
                    phe.loc[:, key].fillna(fna, inplace=True)

        x = exp.values
        y = phe.values.T

        r = self._correlate(x, y, method=method)

        if (not isinstance(features, str)) and (not isinstance(phenotypes, str)):
            return pd.DataFrame(
                    data=r,
                    index=exp.index,
                    columns=phe.columns,
                    dtype=float)
        elif isinstance(features, str) and (not isinstance(phenotypes, str)):
            return pd.Series(
                    data=r[0],
                    index=phe.columns,
                    name='correlation',
                    dtype=float)
        elif (not isinstance(features, str)) and isinstance(phenotypes, str):
            return pd.Series(
                    data=r[:, 0],
                    index=exp.index,
                    name='correlation',
                    dtype=float)
        else:
            return r[0, 0]

[docs]    def correlate_features_features(
            self,
            features='all',
            features2=None,
            method='spearman'):
        '''Correlate feature expression with one or more phenotypes.

        Args:
            features (list or string): list of features to correlate. Use a
                string for a single feature. The special string 'all'
                (default) uses all features.
            features2 (list or string): list of features to correlate with.
                Use a string for a single feature. The special string
                'all' uses all features. None (default) takes the same
                list as features, returning a square matrix.
            method (string): type of correlation. Must be one of 'pearson' or
                'spearman'.

        Returns:
            pandas.DataFrame with the correlation coefficients. If either
                features or features2 is a single string, the function
                returns a pandas.Series. If both are a string, it returns
                a single correlation coefficient.
        '''
        exp_all = self.dataset.counts
        if not isinstance(features, str):
            exp = exp_all.loc[features]
        elif features == 'all':
            features = exp_all.index
            exp = exp_all
        else:
            exp = exp_all.loc[[features]]

        if features2 is None:
            features = exp.index
            exp2 = exp
        elif not isinstance(features2, str):
            exp2 = exp_all.loc[features2]
        elif features2 == 'all':
            features = exp_all.index
            exp2 = exp_all
        else:
            exp2 = exp_all.loc[[features2]]

        x = exp.values
        y = exp2.values

        r = self._correlate(x, y, method=method)

        if (not isinstance(features, str)) and (not isinstance(features2, str)):
            return pd.DataFrame(
                    data=r,
                    index=exp.index,
                    columns=exp2.index,
                    dtype=float)
        elif isinstance(features, str) and (not isinstance(features2, str)):
            return pd.Series(
                    data=r[0],
                    index=exp2.index,
                    name='correlation',
                    dtype=float)
        elif (not isinstance(features, str)) and isinstance(features2, str):
            return pd.Series(
                    data=r[:, 0],
                    index=exp.index,
                    name='correlation',
                    dtype=float)
        else:
            return r[0, 0]

[docs]    def correlate_phenotypes_phenotypes(
            self,
            phenotypes,
            phenotypes2=None,
            method='spearman',
            fillna=None,
            fillna2=None,
            ):
        '''Correlate feature expression with one or more phenotypes.

        Args:
            phenotypes (list of string): list of phenotypes, i.e. columns of
                the samplesheet. Use a string for a single phenotype.
            phenotypes2 (list of string): list of phenotypes, i.e. columns of
                the samplesheet. Use a string for a single phenotype.
                None (default) uses the same as phenotypes.
            method (string): type of correlation. Must be one of 'pearson' or
                'spearman'.
            fillna (dict, int, or None): a dictionary with phenotypes as keys
                and numbers to fill for NaNs as values. None will do
                nothing, potentially yielding NaN as correlation
                coefficients.
            fillna2 (dict, int, or None): as fillna, but for phenotypes2.

        Returns:
            pandas.DataFrame with the correlation coefficients. If either
                phenotypes or features is a single string, the function
                returns a pandas.Series. If both are a string, it returns
                a single correlation coefficient.
        '''

        phe_all = self.dataset.samplesheet
        if isinstance(phenotypes, str):
            phe = phe_all.loc[:, [phenotypes]]
        else:
            phe = phe_all.loc[:, phenotypes]

        if phenotypes2 is None:
            phe2 = phe.copy()
        elif isinstance(phenotypes2, str):
            phe2 = phe_all.loc[:, [phenotypes2]]
        else:
            phe2 = phe_all.loc[:, phenotypes2]

        if fillna is not None:
            phe = phe.copy()
            if np.isscalar(fillna):
                phe.fillna(fillna, inplace=True)
            else:
                for key, fna in fillna.items():
                    phe.loc[:, key].fillna(fna, inplace=True)

        if fillna2 is not None:
            phe2 = phe2.copy()
            if np.isscalar(fillna2):
                phe2.fillna(fillna2, inplace=True)
            else:
                for key, fna in fillna2.items():
                    phe2.loc[:, key].fillna(fna, inplace=True)

        x = phe.values.T
        y = phe2.values.T

        r = self._correlate(x, y, method=method)

        if (not isinstance(phenotypes, str)) and (not isinstance(phenotypes2, str)):
            return pd.DataFrame(
                    data=r,
                    index=phe.columns,
                    columns=phe2.columns,
                    dtype=float)
        elif isinstance(phenotypes, str) and (not isinstance(phenotypes2, str)):
            return pd.Series(
                    data=r[0],
                    index=phe2.columns,
                    name='correlation',
                    dtype=float)
        elif (not isinstance(phenotypes, str)) and isinstance(phenotypes2, str):
            return pd.Series(
                    data=r[:, 0],
                    index=phe.columns,
                    name='correlation',
                    dtype=float)
        else:
            return r[0, 0]

[docs]    def mutual_information(
            self,
            xs,
            ys):
        '''Mutual information between feature counts and/or phenotypes

        Args:
            xs (list or string): Features and/or phenotypes to use as
                abscissa (independent variable). The string
                'total' means all features including spikeins and other,
                'mapped' means all features excluding spikeins and other,
                'spikeins' means only spikeins, and 'other' means only
                'other' features.
            ys (list or string): Features and/or phenotypes to use as
                ordinate (dependent variable). The string
                'total' means all features including spikeins and other,
                'mapped' means all features excluding spikeins and other,
                'spikeins' means only spikeins, and 'other' means only
                'other' features.

        NOTE: Mutual information is defined only for discrete or categorical
            variables and require a decent coverage of all bins or
            categories because it has p(x)p(y) in the denominator.
            Feature counts and quantitative phenotypes require binning
            prior to calculating Mutual information. See CountsTable.bin
            and SampleSheet.bin for options. This function uses
            all unique values in the counts and phenotyes as separate
            bins.
        '''
        datad = {'x': {'names': xs, 'data': []},
                 'y': {'names': ys, 'data': []}}

        counts = self.dataset.counts
        sheet = self.dataset.samplesheet.T

        for key, dic in datad.items():
            args = dic['names']
            if args == 'total':
                counts_k = counts
            elif args == 'mapped':
                counts_k = counts.exclude_features(
                        spikeins=True, other=True)
            elif args == 'spikeins':
                counts_k = counts.get_spikeins()
            elif args == 'other':
                counts_k = counts.get_other_features()
            else:
                counts_k = counts.loc[counts.index.isin(args)]

            pheno_k = sheet.loc[sheet.index.isin(args)]

            if len(counts_k):
                dic['data'].append(counts_k)
            if len(pheno_k):
                dic['data'].append(pheno_k)

            if len(dic['data']) == 0:
                raise ValueError('Data for {:} are empty!'.format(key))
            elif len(dic['data']) == 1:
                dic['data'] = dic['data'][0]
            else:
                dic['data'] = pd.concat(dic['data'], axis=0)

        # Prepare output structure
        n_x = datad['x']['data'].shape[0]
        n_y = datad['y']['data'].shape[0]
        coords_x = datad['x']['data'].index
        coords_y = datad['y']['data'].index
        res = pd.DataFrame(
                data=np.zeros((n_x, n_y)),
                index=coords_x,
                columns=coords_y)

        for key_x, data_x in datad['x']['data'].iterrows():
            for key_y, data_y in datad['y']['data'].iterrows():
                px = data_x.value_counts(normalize=True, sort=False)
                py = data_y.value_counts(normalize=True, sort=False)

                data_xy = pd.concat([data_x, data_y], axis=0)

                # FIXME: finish this
                import ipdb; ipdb.set_trace()
singlet

Navigation

Related Topics

Source code for singlet.dataset.correlations