Source code for singlet.dataset.fit

# vim: fdm=indent
# author:     Fabio Zanini
# date:       16/08/17
# content:    Dataset functions to plot gene expression and phenotypes
# Modules
import numpy as np
import pandas as pd
import xarray as xr


# Classes / functions
[docs]class Fit(): '''Plot gene expression and phenotype in single cells''' def __init__(self, dataset): '''Fit gene expression and phenotype in single cells Args: dataset (Dataset): the dataset to analyze. ''' self.dataset = dataset
[docs] def fit_single( self, xs, ys, model, method='least-squares', handle_nans='ignore', **kwargs): '''Fit feature expression or phenotypes against other. Args: xs (list or string): Features and/or phenotypes to use as \ abscissa (independent variable). The string \ 'total' means all features including spikeins and other, \ 'mapped' means all features excluding spikeins and other, \ 'spikeins' means only spikeins, and 'other' means only \ 'other' features. ys (list or string): Features and/or phenotypes to use as \ ordinate (dependent variable). The string \ 'total' means all features including spikeins and other, \ 'mapped' means all features excluding spikeins and other, \ 'spikeins' means only spikeins, and 'other' means only \ 'other' features. model (string or function): The model to use for fitting. If a \ string, it must be one of 'linear', 'threshold-linear', \ 'logistic'. If a function, it must accept an array \ as first argument (the x) and the parameters as \ additional arguments (like scipy.optimize.curve_fit). method (string or function): The minimization algorithm. For now, \ only 'least-squares' is accepted. In this case, the \ goodness of fit is the sum of the squared residues. handle_nans (string): How to deal with Not a Numbers, typically \ in the phenotypes. Must be either 'ignore' (default), in \ which case only the non-NaN samples will be used for \ fitting, or 'raise', in which case NaNs will stop the fit. **kwargs: Passed to the fit function. For nonlinear \ least-squares, this is scipy.optimize.curve_fit. Linear \ least-squares is analytical so it ignores **kwargs. Returns: A 3-dimensional xarray with the xs, ys as first two axes. The \ third axis, called 'results', contains the parameters \ and an assessment of the fit quality. If method is \ least-squres, it is the sum of squared residuals. NOTE: This function fits every combination of x and y independently, \ interactions are not considered. ''' datad = {'x': {'names': xs, 'data': []}, 'y': {'names': ys, 'data': []}} counts = self.dataset.counts sheet = self.dataset.samplesheet.T for key, dic in datad.items(): args = dic['names'] if args == 'total': counts_k = counts elif args == 'mapped': counts_k = counts.exclude_features( spikeins=True, other=True) elif args == 'spikeins': counts_k = counts.get_spikeins() elif args == 'other': counts_k = counts.get_other_features() else: counts_k = counts.loc[counts.index.isin(args)] pheno_k = sheet.loc[sheet.index.isin(args)] if len(counts_k): dic['data'].append(counts_k) if len(pheno_k): dic['data'].append(pheno_k) if len(dic['data']) == 0: raise ValueError('Data for {:} are empty!'.format(key)) elif len(dic['data']) == 1: dic['data'] = dic['data'][0] else: dic['data'] = pd.concat(dic['data'], axis=0) # Prepare output structure if model == 'linear': n_pars = 2 coords_par = ['slope', 'intercept'] elif model == 'threshold-linear': n_pars = 3 coords_par = ['baseline', 'intercept', 'slope'] elif model == 'logistic': n_pars = 4 coords_par = ['initial', 'final', 'mid-point', 'slope'] elif isinstance(model, str): raise ValueError('model format not accepted') else: from inspect import signature sig = signature(model) n_pars = len(sig.parameters) - 1 coords_par = [p.name for p in sig.parameters[1:]] if method == 'least-squares': gof_name = 'sum_squared_residuals' else: raise ValueError('Only least-squares is implemented') n_x = datad['x']['data'].shape[0] n_y = datad['y']['data'].shape[0] coords_x = datad['x']['data'].index.tolist() coords_y = datad['y']['data'].index.tolist() res = xr.DataArray( np.zeros((n_x, n_y, n_pars + 1)), dims=['x', 'y', 'parameters'], coords={'x': coords_x, 'y': coords_y, 'parameters': coords_par + [gof_name]}) # Set up fitting environment if method == 'least-squares': from scipy.optimize import curve_fit from scipy.stats import linregress if model == 'linear': def fun(x, s, i): return i + s * x elif model == 'threshold-linear': def fun(x, b, i, s): y = x.copy() t = (b - i) / s y[x <= t] = b y[x > t] = i + s * x return y elif model == 'logistic': def fun(x, i, f, mp, s): return i + (f - i) / (1 + np.exp(-s * (x - mp))) else: fun = model # Fit for key_x, data_x in datad['x']['data'].iterrows(): for key_y, data_y in datad['y']['data'].iterrows(): x = data_x.values y = data_y.values.astype(float) # Mask NaNs isnan = np.zeros_like(x, bool) isnan |= np.isnan(x) isnan |= np.isnan(y) if isnan.sum() != 0: if handle_nans == 'raise': raise ValueError('NaNs found in data.') else: x = x[~isnan] y = y[~isnan] if model == 'linear': popt = linregress(x, y)[:2] else: popt, pcov = curve_fit( fun, xdata=x, ydata=y, **kwargs) gof = ((y - fun(x, *popt))**2).sum() res_i = np.append(popt, gof) res.loc[{'x': key_x, 'y': key_y}] = res_i else: raise ValueError('Only least-squares is implemented') return res