Source code for singlet.dataset.fit

# vim: fdm=indent
# author:     Fabio Zanini
# date:       16/08/17
# content:    Dataset functions to plot gene expression and phenotypes
# Modules
import numpy as np
import pandas as pd
import xarray as xr


# Classes / functions
[docs]class Fit():
    '''Plot gene expression and phenotype in single cells'''
    def __init__(self, dataset):
        '''Fit gene expression and phenotype in single cells

        Args:
            dataset (Dataset): the dataset to analyze.
        '''
        self.dataset = dataset

[docs]    def fit_single(
            self,
            xs,
            ys,
            model,
            method='least-squares',
            handle_nans='ignore',
            **kwargs):
        '''Fit feature expression or phenotypes against other.

        Args:
            xs (list or string): Features and/or phenotypes to use as \
                    abscissa (independent variable). The string \
                    'total' means all features including spikeins and other, \
                    'mapped' means all features excluding spikeins and other, \
                    'spikeins' means only spikeins, and 'other' means only \
                    'other' features.
            ys (list or string): Features and/or phenotypes to use as \
                    ordinate (dependent variable). The string \
                    'total' means all features including spikeins and other, \
                    'mapped' means all features excluding spikeins and other, \
                    'spikeins' means only spikeins, and 'other' means only \
                    'other' features.
            model (string or function): The model to use for fitting. If a \
                    string, it must be one of 'linear', 'threshold-linear', \
                    'logistic'. If a function, it must accept an array \
                    as first argument (the x) and the parameters as \
                    additional arguments (like scipy.optimize.curve_fit).
            method (string or function): The minimization algorithm. For now, \
                    only 'least-squares' is accepted. In this case, the \
                    goodness of fit is the sum of the squared residues.
            handle_nans (string): How to deal with Not a Numbers, typically \
                    in the phenotypes. Must be either 'ignore' (default), in \
                    which case only the non-NaN samples will be used for \
                    fitting, or 'raise', in which case NaNs will stop the fit.
            **kwargs: Passed to the fit function. For nonlinear \
                    least-squares, this is scipy.optimize.curve_fit. Linear \
                    least-squares is analytical so it ignores **kwargs.

        Returns:
            A 3-dimensional xarray with the xs, ys as first two axes. The \
                    third axis, called 'results', contains the parameters \
                    and an assessment of the fit quality. If method is \
                    least-squres, it is the sum of squared residuals.

        NOTE: This function fits every combination of x and y independently, \
                interactions are not considered.
        '''
        datad = {'x': {'names': xs, 'data': []},
                 'y': {'names': ys, 'data': []}}

        counts = self.dataset.counts
        sheet = self.dataset.samplesheet.T

        for key, dic in datad.items():
            args = dic['names']
            if args == 'total':
                counts_k = counts
            elif args == 'mapped':
                counts_k = counts.exclude_features(
                        spikeins=True, other=True)
            elif args == 'spikeins':
                counts_k = counts.get_spikeins()
            elif args == 'other':
                counts_k = counts.get_other_features()
            else:
                counts_k = counts.loc[counts.index.isin(args)]

            pheno_k = sheet.loc[sheet.index.isin(args)]

            if len(counts_k):
                dic['data'].append(counts_k)
            if len(pheno_k):
                dic['data'].append(pheno_k)

            if len(dic['data']) == 0:
                raise ValueError('Data for {:} are empty!'.format(key))
            elif len(dic['data']) == 1:
                dic['data'] = dic['data'][0]
            else:
                dic['data'] = pd.concat(dic['data'], axis=0)

        # Prepare output structure
        if model == 'linear':
            n_pars = 2
            coords_par = ['slope', 'intercept']
        elif model == 'threshold-linear':
            n_pars = 3
            coords_par = ['baseline', 'intercept', 'slope']
        elif model == 'logistic':
            n_pars = 4
            coords_par = ['initial', 'final', 'mid-point', 'slope']
        elif isinstance(model, str):
            raise ValueError('model format not accepted')
        else:
            from inspect import signature
            sig = signature(model)
            n_pars = len(sig.parameters) - 1
            coords_par = [p.name for p in sig.parameters[1:]]

        if method == 'least-squares':
            gof_name = 'sum_squared_residuals'
        else:
            raise ValueError('Only least-squares is implemented')

        n_x = datad['x']['data'].shape[0]
        n_y = datad['y']['data'].shape[0]
        coords_x = datad['x']['data'].index.tolist()
        coords_y = datad['y']['data'].index.tolist()

        res = xr.DataArray(
                np.zeros((n_x, n_y, n_pars + 1)),
                dims=['x', 'y', 'parameters'],
                coords={'x': coords_x, 'y': coords_y,
                        'parameters': coords_par + [gof_name]})

        # Set up fitting environment
        if method == 'least-squares':
            from scipy.optimize import curve_fit
            from scipy.stats import linregress

            if model == 'linear':
                def fun(x, s, i):
                    return i + s * x
            elif model == 'threshold-linear':
                def fun(x, b, i, s):
                    y = x.copy()
                    t = (b - i) / s
                    y[x <= t] = b
                    y[x > t] = i + s * x
                    return y
            elif model == 'logistic':
                def fun(x, i, f, mp, s):
                    return i + (f - i) / (1 + np.exp(-s * (x - mp)))
            else:
                fun = model

            # Fit
            for key_x, data_x in datad['x']['data'].iterrows():
                for key_y, data_y in datad['y']['data'].iterrows():
                    x = data_x.values
                    y = data_y.values.astype(float)

                    # Mask NaNs
                    isnan = np.zeros_like(x, bool)
                    isnan |= np.isnan(x)
                    isnan |= np.isnan(y)
                    if isnan.sum() != 0:
                        if handle_nans == 'raise':
                            raise ValueError('NaNs found in data.')
                        else:
                            x = x[~isnan]
                            y = y[~isnan]

                    if model == 'linear':
                        popt = linregress(x, y)[:2]
                    else:
                        popt, pcov = curve_fit(
                                fun,
                                xdata=x,
                                ydata=y,
                                **kwargs)

                    gof = ((y - fun(x, *popt))**2).sum()

                    res_i = np.append(popt, gof)

                    res.loc[{'x': key_x, 'y': key_y}] = res_i
        else:
            raise ValueError('Only least-squares is implemented')

        return res
singlet

Navigation

Related Topics

Source code for singlet.dataset.fit