Source code for singlet.dataset.cluster

# vim: fdm=indent
# author:     Fabio Zanini
# date:       16/08/17
# content:    Dataset functions to cluster samples, features, and phenotypes.
# Modules
import numpy as np
import pandas as pd

from ..utils.cache import method_caches


# Classes / functions
[docs]class Cluster(): '''Cluster samples, features, and phenotypes''' def __init__(self, dataset): '''Cluster samples, features, and phenotypes Args: dataset (Dataset): the dataset to analyze. ''' self.dataset = dataset # NOTE: caching this one is tricky because it has non-kwargs AND it would # need a double cache, one for cells and one for features/phenotypes
[docs] def hierarchical( self, axis, phenotypes=(), metric='correlation', method='average', log_features=True, optimal_ordering=False, **kwargs): '''Hierarchical clustering. Args: axis (string): It must be 'samples' or 'features'. \ The Dataset.counts matrix is used and \ either samples or features are clustered. phenotypes (iterable of strings): Phenotypes to add to the \ features for joint clustering. metric (string): Metric to calculate the distance matrix. Should \ be a string accepted by scipy.spatial.distance.pdist. method (string): Clustering method. Must be a string accepted by \ scipy.cluster.hierarchy.linkage. log_features (bool): Whether to add pseudocounts and take a log \ of the feature counts before calculating distances. optimal_ordering (bool): Whether to resort the linkage so that \ nearest neighbours have shortest distance. This may take \ longer than the clustering itself. Return: dict with the linkage, distance matrix, and ordering. ''' from scipy.spatial.distance import pdist from scipy.cluster.hierarchy import linkage, leaves_list if optimal_ordering: try: from polo import optimal_leaf_ordering except ImportError: raise ImportError( 'The package "polo" is needed for optimal leaf ordering') data = self.dataset.counts if log_features: data = np.log10(self.dataset.counts.pseudocount + data) if phenotypes is not None: data = data.copy() for pheno in phenotypes: data.loc[pheno] = self.dataset.samplesheet.loc[:, pheno] if axis == 'samples': data = data.T elif axis == 'features': pass else: raise ValueError('axis must be "samples" or "features"') Y = pdist(data.values, metric=metric) # Some metrics (e.g. correlation) give nan whenever the matrix has no # variation, default this to zero distance (e.g. two features that are # both total dropouts. Y = np.nan_to_num(Y) Z = linkage(Y, method=method) if optimal_ordering: Z = optimal_leaf_ordering(Z, Y) ids = data.index[leaves_list(Z)] return { 'distance': Y, 'linkage': Z, 'leaves': ids, }