Source code for singlet.io

# vim: fdm=indent
# author:     Fabio Zanini
# date:       14/08/17
# content:    Parse sample sheets.
# Modules
import numpy as np
import pandas as pd
from singlet.config import config


# Formats that parse_dataset's error message lists as supported for
# integrated dataset parsing.
integrated_dataset_formats = ['loom']


# Parser
def parse_samplesheet(dictionary):
    """Load the samplesheet selected by ``dictionary`` from the configuration.

    Args:
        dictionary: mapping with either a 'sheetname' key (looked up in
            config['io']['samplesheets']) or a 'datasetname' key (looked up
            in config['io']['datasets'], using that entry's 'samplesheet').
            'sheetname' takes precedence when both are present.

    Returns:
        The parsed table, transposed when the sheet's 'cells' entry is set
        to something other than 'rows', and indexed in place by the column
        named in the sheet's 'index' entry ('name' when absent).

    Raises:
        ValueError: if neither key is supplied, or if the sheet has neither
            a 'format' listed in csv_formats nor a 'url'.
    """
    from .csv import parse_samplesheet as parse_csv, csv_formats
    from .googleapi import parse_samplesheet as parse_googleapi

    # Locate the sheet description in the configuration
    has_sheetname = 'sheetname' in dictionary
    if not has_sheetname and 'datasetname' not in dictionary:
        raise ValueError('Please specify a samplesheet or a dataset')
    if has_sheetname:
        sheet = config['io']['samplesheets'][dictionary['sheetname']]
    else:
        dataset = config['io']['datasets'][dictionary['datasetname']]
        sheet = dataset['samplesheet']

    # Pick the backend: CSV-like file first, Google API sheet otherwise
    is_csv = ('format' in sheet) and (sheet['format'] in csv_formats)
    if not is_csv and 'url' not in sheet:
        raise ValueError('samplesheet format not recognized')
    if is_csv:
        table = parse_csv(sheet['path'], sheet['format'])
    else:
        table = parse_googleapi(sheet)

    # Transpose unless the sheet is declared (or defaults) as cells-in-rows
    needs_transpose = ('cells' in sheet) and (sheet['cells'] != 'rows')
    if needs_transpose:
        table = table.T

    index_col = sheet['index'] if 'index' in sheet else 'name'
    table.set_index(index_col, inplace=True, drop=True)
    return table
def parse_featuresheet(dictionary):
    """Load the featuresheet selected by ``dictionary`` from the configuration.

    Args:
        dictionary: mapping with either a 'sheetname' key (looked up in
            config['io']['featuresheets']) or a 'datasetname' key (looked up
            in config['io']['datasets'], using that entry's 'featuresheet').
            'sheetname' takes precedence when both are present.

    Returns:
        The parsed table, transposed when the sheet's 'features' entry is
        set to something other than 'rows', and indexed in place by the
        column named in the sheet's 'index' entry ('name' when absent).

    Raises:
        ValueError: if neither key is supplied, or if the sheet's 'format'
            is not one of csv_formats.
    """
    from .csv import parse_featuresheet as parse_csv, csv_formats

    if 'sheetname' in dictionary:
        sheet = config['io']['featuresheets'][dictionary['sheetname']]
    elif 'datasetname' in dictionary:
        sheet = config['io']['datasets'][dictionary['datasetname']]['featuresheet']
    else:
        raise ValueError('Please specify a featuresheet or a dataset')

    if sheet['format'] in csv_formats:
        table = parse_csv(sheet['path'], sheet['format'])
    else:
        # The message used to say "samplesheet", which pointed users at the
        # wrong configuration section.
        raise ValueError('featuresheet format not recognized')

    # Transpose unless the sheet is declared (or defaults) as features-in-rows
    if ('features' in sheet) and (sheet['features'] != 'rows'):
        table = table.T

    index_col = sheet['index'] if 'index' in sheet else 'name'
    table.set_index(index_col, inplace=True, drop=True)
    return table
def parse_counts_table(dictionary):
    """Load one or more counts tables and return them as a single table.

    Args:
        dictionary: mapping with either a 'countsname' key (looked up in
            config['io']['count_tables']) or a 'datasetname' key (looked up
            in config['io']['datasets'], using that entry's 'counts_table').
            'countsname' takes precedence when both are present.

    The selected configuration entry must provide:
        'path': a single path or a sequence of paths.
        'format': a single format, applied to every path, or one format per
            path. Formats in csv_formats and 'pickle' are understood.
        'bit_precision': 16, 32, 64 or 128; counts are cast to the matching
            numpy float type.
    and may provide 'cells' (tables are transposed unless it is 'columns')
    and 'index' (column to use as index).

    Returns:
        The parsed table, or the column-wise concatenation of all parsed
        tables when several paths are configured.

    Raises:
        ValueError: if no table is specified, the numbers of paths and
            formats differ, a format is not understood, or the bit precision
            is invalid or unavailable on this platform.
    """
    from .csv import parse_counts_table as parse_csv, csv_formats
    # NOTE(review): presumably this parser unpickles the file; pickle must
    # only be used on trusted paths.
    from .pickle import parse_counts_table as parse_pickle

    if 'countsname' in dictionary:
        sheet = config['io']['count_tables'][dictionary['countsname']]
    elif 'datasetname' in dictionary:
        sheet = config['io']['datasets'][dictionary['datasetname']]['counts_table']
    else:
        raise ValueError('Please specify a counts_table or a dataset')

    paths = sheet['path']
    fmts = sheet['format']
    paths = [paths] if isinstance(paths, str) else list(paths)
    if isinstance(fmts, str):
        # A lone format string applies to every path. Zipping it directly
        # would pair each path with a single character of the string.
        fmts = [fmts] * len(paths)
    else:
        fmts = list(fmts)
    if len(paths) != len(fmts):
        # zip() would silently drop the unmatched paths
        raise ValueError(
            'Got {:} paths but {:} formats for the counts table'.format(
                len(paths), len(fmts)))

    # Counts are floats. Resolve the dtype once, before any file is read.
    float_types = {
        16: np.float16,
        32: np.float32,
        64: np.float64,
        # float128 does not exist in numpy on every platform
        128: getattr(np, 'float128', None),
    }
    bit_precision = sheet['bit_precision']
    dtype = float_types.get(bit_precision)
    if dtype is None:
        if bit_precision == 128:
            raise ValueError(
                '128 bit precision is not supported by numpy on this platform')
        raise ValueError('Bit precision must be one of 16, 32, 64, or 128')

    tables = []
    for path, fmt in zip(paths, fmts):
        if fmt in csv_formats:
            parse = parse_csv
        elif fmt == 'pickle':
            parse = parse_pickle
        else:
            raise ValueError('Format not understood')

        table = parse(path, fmt)

        if ('cells' in sheet) and (sheet['cells'] != 'columns'):
            table = table.T

        if 'index' in sheet:
            table.set_index(sheet['index'], inplace=True, drop=True)
        elif not table.index.name:
            # Unnamed index: fall back to the first column
            table.set_index(table.columns[0], inplace=True, drop=True)

        # Feature names are strings
        table.index = table.index.astype(str)

        tables.append(table.astype(dtype))

    if len(tables) == 1:
        return tables[0]
    return pd.concat(tables, axis=1)
def parse_counts_table_sparse(dictionary):
    """Load one or more sparse counts tables and return them as one table.

    Args:
        dictionary: mapping with either a 'countsname' key (looked up in
            config['io']['count_tables']) or a 'datasetname' key (looked up
            in config['io']['datasets'], using that entry's 'counts_table').
            'countsname' takes precedence when both are present.

    The selected configuration entry must provide 'path' (a single path or a
    sequence of paths) and 'format' (a single format, applied to every path,
    or one format per path; only 'npz' is understood). It may provide
    'cells' (tables are transposed unless it is 'columns') and 'index'
    (column to use as index).

    Returns:
        The parsed table, or the column-wise concatenation of all parsed
        tables when several paths are configured.

    Raises:
        ValueError: if no table is specified, the numbers of paths and
            formats differ, or a format is not 'npz'.
    """
    from .npz import parse_counts_table_sparse as parse_npz

    if 'countsname' in dictionary:
        sheet = config['io']['count_tables'][dictionary['countsname']]
    elif 'datasetname' in dictionary:
        sheet = config['io']['datasets'][dictionary['datasetname']]['counts_table']
    else:
        raise ValueError('Please specify a counts_table or a dataset')

    paths = sheet['path']
    fmts = sheet['format']
    paths = [paths] if isinstance(paths, str) else list(paths)
    if isinstance(fmts, str):
        # A lone format string applies to every path. Zipping it directly
        # would pair each path with a single character of the string.
        fmts = [fmts] * len(paths)
    else:
        fmts = list(fmts)
    if len(paths) != len(fmts):
        # zip() would silently drop the unmatched paths
        raise ValueError(
            'Got {:} paths but {:} formats for the counts table'.format(
                len(paths), len(fmts)))

    tables = []
    for path, fmt in zip(paths, fmts):
        if fmt != 'npz':
            raise ValueError('Format not understood')

        table = parse_npz(path, fmt)

        if ('cells' in sheet) and (sheet['cells'] != 'columns'):
            table = table.T

        if 'index' in sheet:
            table.set_index(sheet['index'], inplace=True, drop=True)
        elif not table.index.name:
            # Unnamed index: fall back to the first column
            table.set_index(table.columns[0], inplace=True, drop=True)

        tables.append(table)

    if len(tables) == 1:
        return tables[0]
    return pd.concat(tables, axis=1)
def parse_dataset(dictionary):
    """Load an integrated dataset described in the configuration.

    Args:
        dictionary: mapping with a 'datasetname' key, looked up in
            config['io']['datasets'].

    Returns:
        Whatever the loom parser returns for the configured 'path',
        'axis_samples', 'index_samples', 'index_features' and
        'bit_precision'.

    Raises:
        ValueError: if 'datasetname' is missing or the dataset 'format' is
            not 'loom'.
    """
    from .loom import parse_dataset as parse_loom

    if 'datasetname' not in dictionary:
        raise ValueError('A datasetname is required')
    dataset = config['io']['datasets'][dictionary['datasetname']]

    if dataset['format'] != 'loom':
        supported = ', '.join(integrated_dataset_formats)
        raise ValueError(
            'Integrated dataset parsing supports the following formats: {:}'.format(
                supported))

    return parse_loom(
        dataset['path'],
        dataset['axis_samples'],
        dataset['index_samples'],
        dataset['index_features'],
        bit_precision=dataset['bit_precision'],
    )