Source code for riboraptor.utils

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import numpy as np
import glob
import re
import pickle
import os

import pandas as pd


def summary_starlogs_over_runs(directory, list_of_srr):
    """Concatenate the per-run ``<run>.tsv`` tables found in a directory.

    Parameters
    ----------
    directory : str
        Directory that is searched for ``<run>.tsv`` files.
    list_of_srr : iterable of str
        Run identifiers; each one is looked up as ``<run>.tsv``.

    Returns
    -------
    df : pandas.DataFrame
        Row-wise concatenation of every table that was found, in the order
        of ``list_of_srr``. Empty when no file was found.
    files_not_found : list of str
        Run identifiers for which no ``.tsv`` file exists.
    """
    frames = []
    files_not_found = []
    for run in list_of_srr:
        tsv_path = os.path.join(directory, run + '.tsv')
        if not os.path.isfile(tsv_path):
            files_not_found.append(run)
            continue
        frames.append(pd.read_table(tsv_path))
    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # previously read rows on every iteration. pd.concat rejects an empty
    # list, so fall back to an empty frame in that case.
    df = pd.concat(frames) if frames else pd.DataFrame()
    return df, files_not_found


def load_tpm(path):
    """Read the table at ``path`` and index it by gene.

    The columns are labelled ``gene_id`` and ``tpm`` (the file is read
    without a header row), and ``gene_id`` becomes the index.

    Parameters
    ----------
    path : str
        Location of the table, parsed with ``pandas.read_table``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``gene_id`` with a ``tpm`` column.
    """
    column_names = ['gene_id', 'tpm']
    table = pd.read_table(path, names=column_names)
    return table.set_index('gene_id')
def get_cell_line_or_tissue(row):
    """Build a ``<label>-<study>-<experiment>`` identifier for a row.

    The label is the first of ``row['cell_line']``, ``row['tissue']`` and
    ``row['source_name']`` whose string form is neither blank nor ``'nan'``.
    When none qualifies, ``row['source_name']`` is used as the label anyway.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``cell_line``, ``tissue``, ``source_name``,
        ``study_accession`` and ``experiment_accession``.

    Returns
    -------
    str
        The formatted identifier.
    """
    template = '{}-{}-{}'
    for field in ('cell_line', 'tissue', 'source_name'):
        text = str(row[field]).strip()
        if text and text != 'nan':
            return template.format(row[field], row['study_accession'],
                                   row['experiment_accession'])
    # Rows of this one study are printed when no usable label was found.
    if row['study_accession'].strip() == 'SRP052229':
        print(row)
    return template.format(row['source_name'], row['study_accession'],
                           row['experiment_accession'])
def determine_cell_type(sample_attribute):
    """Extract an upper-cased cell type from a sample attribute string.

    The labels ``cell line``, ``cell_line``, ``cell-line``, ``cell_type``
    and ``source_name`` are tried in that order; the first one that is
    followed by ``': '`` and a word wins.

    Parameters
    ----------
    sample_attribute : object
        Attribute text; it is converted with ``str`` before searching.

    Returns
    -------
    str or float
        The matched word in upper case, or ``numpy.nan`` (after printing
        the attribute) when no label matches.
    """
    sample_attribute = str(sample_attribute)
    patterns = (
        r'cell line: (\w+)',
        r'cell_line: (\w+)',
        r'cell-line: (\w+)',
        r'cell_type: (\w+)',
        r'source_name: (\w+)',
    )
    for pattern in patterns:
        match = re.search(pattern, sample_attribute)
        if match:
            # Take the captured word directly. str.strip('cell line: ')
            # removes a *set of characters* from both ends, which also
            # eats leading/trailing letters of the value itself.
            return match.group(1).upper()
    print('Found {}'.format(sample_attribute))
    return np.nan
def get_tissue_type(sample_attribute):
    """Extract a lower-cased tissue name from a sample attribute string.

    Parameters
    ----------
    sample_attribute : object
        Attribute text; it is converted with ``str`` before searching.

    Returns
    -------
    str or float
        The word following ``'tissue: '`` in lower case, or ``numpy.nan``
        (after printing the attribute) when there is no such word.
    """
    sample_attribute = str(sample_attribute)
    match = re.search(r'tissue: (\w+)', sample_attribute)
    if match:
        # Use the captured word: str.strip('tissue: ') strips a character
        # set, so a value such as 'testis' would be removed entirely.
        return match.group(1).lower()
    print('Found {}'.format(sample_attribute))
    return np.nan
def get_strain_type(sample_attribute):
    """Extract a lower-cased strain name from a sample attribute string.

    Parameters
    ----------
    sample_attribute : object
        Attribute text; it is converted with ``str`` before searching.

    Returns
    -------
    str or float
        The word following ``'strain: '`` in lower case, or ``numpy.nan``
        (after printing the attribute) when there is no such word.
    """
    sample_attribute = str(sample_attribute)
    match = re.search(r'strain: (\w+)', sample_attribute)
    if match:
        # Use the captured word: str.strip('strain: ') strips a character
        # set, so a value such as 'sigma' would lose letters at both ends.
        return match.group(1).lower()
    print('Found {}'.format(sample_attribute))
    return np.nan
def summary_starlogs_over_runs(directory, list_of_srr):
    """Concatenate the per-run ``<run>.tsv`` tables found in a directory.

    Parameters
    ----------
    directory : str
        Directory that is searched for ``<run>.tsv`` files.
    list_of_srr : iterable of str
        Run identifiers; each one is looked up as ``<run>.tsv``.

    Returns
    -------
    df : pandas.DataFrame
        Row-wise concatenation of every table that was found, in the order
        of ``list_of_srr``. Empty when no file was found.
    files_not_found : list of str
        Run identifiers for which no ``.tsv`` file exists.
    """
    frames = []
    files_not_found = []
    for run in list_of_srr:
        tsv_path = os.path.join(directory, run + '.tsv')
        if not os.path.isfile(tsv_path):
            files_not_found.append(run)
            continue
        frames.append(pd.read_table(tsv_path))
    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # previously read rows on every iteration. pd.concat rejects an empty
    # list, so fall back to an empty frame in that case.
    df = pd.concat(frames) if frames else pd.DataFrame()
    return df, files_not_found
def get_enrichment_cds_stats(pickle_file):
    """Summarise the values of a pickled mapping, ignoring NaNs.

    Parameters
    ----------
    pickle_file : str
        Path to a pickle file holding an object with a ``values()`` method
        (e.g. a dict) whose values are numeric.

    Returns
    -------
    tuple
        ``(min, max, mean, median, stddev)`` of the values, each computed
        with the NaN-ignoring numpy reduction.
    """
    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    with open(pickle_file, 'rb') as fh:
        data = pickle.load(fh)
    # Materialise the values: on Python 3 ``dict.values()`` is a view that
    # numpy does not treat as a numeric array.
    values = np.array(list(data.values()), dtype=float)
    minimum = np.nanmin(values)
    maximum = np.nanmax(values)
    mean = np.nanmean(values)
    median = np.nanmedian(values)
    stddev = np.nanstd(values)
    return minimum, maximum, mean, median, stddev
def get_fragment_enrichment_score(txt_file):
    """Parse an enrichment score and p-value from a text file.

    The file is expected to hold a single comma-separated pair such as
    ``(Enrichment: 1.5, pval: 0.01)``.

    Parameters
    ----------
    txt_file : str
        Path to the text file.

    Returns
    -------
    tuple
        ``(enrichment, pval)`` as floats, or ``(numpy.nan, 1)`` when the
        enrichment part contains ``'nan'``.

    Raises
    ------
    ValueError
        If the content does not split into exactly two comma-separated
        parts, or a part is not a valid float.
    """
    with open(txt_file) as fh:
        data = fh.read()
    # Drop surrounding whitespace, then the enclosing parentheses (and any
    # stray backslashes, which the character set has always included).
    text = data.strip().strip('\\()').strip()
    enrichment, pval = text.split(',')
    if 'nan' in enrichment:
        return np.nan, 1
    # Take the text after the label's colon instead of stripping the label
    # as a character set, which mangles values such as 'inf'.
    enrichment_value = enrichment.rpartition(':')[2].strip()
    pval_value = pval.rpartition(':')[2].strip(' )')
    return float(enrichment_value), float(pval_value)