Source code for sctriangulate.preprocessing

import sys
import os
import math
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata
import multiprocessing as mp
import scanpy as sc
import anndata as ad
from scipy.io import mmread,mmwrite
from scipy.sparse import csr_matrix,issparse
import matplotlib as mpl
from functools import reduce
from sklearn.decomposition import PCA
import umap
from tqdm import tqdm
import gzip
from io import BytesIO,StringIO

from .colors import *



# for publication ready figure
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'


def sctriangulate_preprocessing_setting(backend='Agg',png=False):
    # change the backend
    mpl.use(backend)
    if png:
        # for publication and super large dataset
        mpl.rcParams['savefig.dpi'] = 600
        mpl.rcParams['figure.dpi'] = 600


[docs]def small_txt_to_adata(int_file,gene_is_index=True,sep='\t'):
    '''
    given a small dense expression (<2GB) txt file, load them into memory as adata, and also make sure the X is sparse matrix.

    :param int_file: string, path to the input txt file, delimited by tab
    :param gene_is_index: boolean, whether the gene/features are the index
    :param sep: the delimieter, default is tab

    :return: AnnData

    Exmaples::

        from sctriangulate.preprocessing import small_txt_to_adata
        adata= = small_txt_to_adata('./input.txt',gene_is_index=True)

    '''
    df = pd.read_csv(int_file,sep=sep,index_col=0)
    if gene_is_index:
        adata = ad.AnnData(X=csr_matrix(df.values.T),var=pd.DataFrame(index=df.index.values),obs=pd.DataFrame(index=df.columns.values))
    else:
        adata = ad.AnnData(X=csr_matrix(df.values),var=pd.DataFrame(index=df.columns.values),obs=pd.DataFrame(index=df.index.values))
    adata.var_names_make_unique()
    adata.X = csr_matrix(adata.X)
    return adata


[docs]def large_txt_to_mtx(int_file,out_folder,gene_is_index=True,type_convert_to='int16',n_lines=None,sep='\t'):  # whether the txt if gene * cell
    '''
    Given a large txt dense expression file, convert them to mtx file on cluster to facilitate future I/O

    :param int_file: string, path to the intput txt file, delimited by tab
    :param out_folder: string, path to the output folder where the mtx file will be stored
    :param gene_is_index: boolean, whether the gene/features is the index in the int_file.
    :param type_convert_to: string, since it is a large dataframe, need to read in chunk, to accelarate it and reduce the memory footprint,
                            we convert it to either 'int16' if original data is count, or 'float32' if original data is normalized data.
    :param n_lines: int, the number of lines for the ``int_file``, it is just for informative progress bar
    :param sep: string, defalut is tab, can be other as well
    
    Examples::

        from sctriangulate.preprocessing import large_txt_to_mtx
        large_txt_to_mtx(int_file='input.txt',out_folder='./data',gene_is_index=False,type_convert_to='float32')
    ''' 
    chunksize = 1000
    reader = pd.read_csv(int_file,sep=sep,index_col=0,chunksize=chunksize)
    store = []
    if n_lines is None:
        for chunk in tqdm(reader):
            tmp = chunk.astype(type_convert_to)
            store.append(tmp)
    else:
        total = n_lines // chunksize + 1
        for chunk in tqdm(reader,total=total):
            tmp = chunk.astype(type_convert_to)
            store.append(tmp)

    data = pd.concat(store)
    '''save as mtx, now!!!'''
    if not os.path.exists(out_folder):
        os.mkdir(out_folder)
    if gene_is_index:
        data.index.to_series().to_csv(os.path.join(out_folder,'genes.tsv'),sep='\t',header=None,index=None)
        data.columns.to_series().to_csv(os.path.join(out_folder,'barcodes.tsv'),sep='\t',header=None,index=None)
        mmwrite(os.path.join(out_folder,'matrix.mtx'),csr_matrix(data.values))
    else:
        data.columns.to_series().to_csv(os.path.join(out_folder,'genes.tsv'),sep='\t',header=None,index=None)
        data.index.to_series().to_csv(os.path.join(out_folder,'barcodes.tsv'),sep='\t',header=None,index=None)
        mmwrite(os.path.join(out_folder,'matrix.mtx'),csr_matrix(data.values.T))        


[docs]def mtx_to_adata(int_folder,gene_is_index=True,feature='genes.tsv',feature_col='index',barcode='barcodes.tsv',barcode_col='index',matrix='matrix.mtx'):  # whether the mtx file is gene * cell
    '''
    convert mtx file to adata in RAM, make sure the X is sparse. Gzip file is supported as well, program will automatically detect gz extension
    and use gzip pacakge to read it into memory

    :param int_folder: string, folder where the mtx files are stored.
    :param gene_is_index: boolean, whether the gene is index.
    :param feature: string, the name of the feature file, default for rna is genes.tsv
    :param feature_col: 'index' as index, or a int (which column, python is zero based) to use in your feature.tsv as feature
    :param barcode: string, the name of the barcode file, default is barcodes.tsv
    :param barcode_col: 'index' as index, or a int (which column, python is zero based) to use in your barcodes.tsv as barcode
    :param matrix: string, the name of the sparse matrix

    :return: AnnData

    Examples::

        from sctriangulate.preprocessing import mtx_to_adata
        mtx_to_adata(int_folder='./data',gene_is_index=False,feature='genes')

    '''

    # read gene
    if feature.endswith('.gz'):
        with gzip.open(os.path.join(int_folder,feature),'rb') as f:
            file_content = f.read()   # bytestring
        stuff_to_read = BytesIO(file_content)
    else:
        stuff_to_read = os.path.join(int_folder,feature)
    if feature_col == 'index':
        gene = pd.read_csv(stuff_to_read,sep='\t',index_col=0,header=None).index
    else:
        gene = pd.read_csv(stuff_to_read,sep='\t',index_col=0,header=None)[feature_col]

    # read barcode
    if feature.endswith('.gz'):
        with gzip.open(os.path.join(int_folder,barcode),'rb') as f:
            file_content = f.read()  # bytestring
        stuff_to_read = BytesIO(file_content)
    else:
        stuff_to_read = os.path.join(int_folder,barcode)
    if barcode_col == 'index':
        cell = pd.read_csv(stuff_to_read,sep='\t',index_col=0,header=None).index
    else:
        cell = pd.read_csv(stuff_to_read,sep='\t',index_col=0,header=None)[barcode_col]

    # read value
    if matrix.endswith('.gz'):
        with gzip.open(os.path.join(int_folder,matrix),'rb') as f:
            file_content = f.read() # bytestring
        stuff_to_read = BytesIO(file_content)
    else:
        stuff_to_read = os.path.join(int_folder,matrix)
    value = csr_matrix(mmread(stuff_to_read))
    if gene_is_index:
        value = value.T
        adata = ad.AnnData(X=value,obs=pd.DataFrame(index=cell),var=pd.DataFrame(index=gene))
    else:
        adata = ad.AnnData(X=value,obs=pd.DataFrame(index=cell),var=pd.DataFrame(index=gene))
    adata.var.index.name = None
    adata.var_names_make_unique()
    return adata


[docs]def mtx_to_large_txt(int_folder,out_file,gene_is_index=False):
    '''
    convert mtx back to large dense txt expression dataframe.

    :param int_folder: string, path to the input mtx folder.
    :param out_file: string, path to the output txt file.
    :param gene_is_index: boolean, whether the gene is the index.

    Examples::

        from sctriangulate.preprocessing import mtx_to_large_txt
        mtx_to_large_txt(int_folder='./data',out_file='input.txt',gene_is_index=False)

    '''
    gene = pd.read_csv(os.path.join(int_folder,'genes.tsv'),sep='\t',index_col=0,header=None).index
    cell = pd.read_csv(os.path.join(int_folder,'barcodes.tsv'),sep='\t',index_col=0,header=None).index
    value = mmread(os.path.join(int_folder,'matrix.mtx')).toarray()
    if gene_is_index:
        data = pd.DataFrame(data=value,index=gene,columns=cell)
    else:
        data = pd.DataFrame(data=value.T,index=cell,columns=gene)
    data.to_csv(out_file,sep='\t',chunksize=1000)


[docs]def adata_to_mtx(adata,gene_is_index=True,var_column=None,obs_column=None,outdir='data'):
    '''
    write a adata to mtx file

    :param adata: AnnData, the adata to convert
    :param gene_is_index: boolean, for the resultant mtx, will gene be the index or feature is the index
    :param var_column: list, the var columns to write the genes.tsv, None will write all available columns
    :param obs_column: list, the obs columns to write the barcodes.tsv, None will write all available columns
    :param outdir: string, the name of the mtx folder, default is data

    Example::

        adata_to_mtx(adata,True,None,None,'data')
    '''
    # create folder if not exist
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    # write genes.tsv
    if var_column is None:
        var = adata.var_names.to_series()
    else:
        var = adata.var[var_column]
    var.to_csv(os.path.join(outdir,'genes.tsv'),sep='\t',header=None,index=None)
    # write barcodes.tsv
    if obs_column is None:
        obs = adata.obs_names.to_series()
    else:
        obs = adata.obs[obs_column]
    obs.to_csv(os.path.join(outdir,'barcodes.tsv'),sep='\t',header=None,index=None)
    # write matrix.mtx
    if not gene_is_index:
        mmwrite(os.path.join(outdir,'matrix.mtx'),make_sure_mat_sparse(adata.X))
    else:
        mmwrite(os.path.join(outdir,'matrix.mtx'),make_sure_mat_sparse(adata.X).transpose())



[docs]def add_azimuth(adata,result,name='predicted.celltype.l2'):
    '''
    a convenient function if you have azimuth predicted labels in hand, and want to add the label to the adata.

    :param adata: AnnData
    :param result: string, the path to the 'azimuth_predict.tsv' file
    :param name: string, the column name where the user want to transfer to the adata.

    Examples::

        from sctriangulate.preprocessing import add_azimuth
        add_azimuth(adata,result='./azimuth_predict.tsv',name='predicted.celltype.l2')

    '''
    azimuth = pd.read_csv(result,sep='\t',index_col=0)
    azimuth_map = azimuth[name].to_dict()
    azimuth_prediction = azimuth['{}.score'.format(name)].to_dict()
    azimuth_mapping = azimuth['mapping.score'].to_dict()
    adata.obs['azimuth'] = adata.obs_names.map(azimuth_map).values
    adata.obs['prediction_score'] = adata.obs_names.map(azimuth_prediction).values
    adata.obs['mapping_score'] = adata.obs_names.map(azimuth_mapping).values

[docs]def add_annotations(adata,inputs,cols_input,index_col=0,cols_output=None,sep='\t',kind='disk'):
    '''
    Adding annotations from external sources to the adata

    :param adata: Anndata
    :param inputs: string, path to the txt file where the barcode to cluster label information is stored.
    :param cols_input: list, what columns the users want to transfer to the adata.
    :param index_col: int, for the input, which column will serve as the index column
    :param cols_output: list, corresponding to the cols_input, how these columns will be named in the adata.obs columns
    :param sep: string, default is tab
    :param kind: a string, either 'disk', or 'memory', disk means the input is the path to the text file, 'memory' means the input is the
                variable name in the RAM that represents the dataframe

    Examples::

        from sctriangulate.preprocessing import add_annotations
        add_annotations(adata,inputs='./annotation.txt',cols_input=['col1','col2'],index_col=0,cols_output=['annotation1','annontation2'],kind='disk')
        add_annotations(adata,inputs=df,cols_input=['col1','col2'],index_col=0,cols_output=['annotation1','annontation2'],kind='memory')

    '''
    # means a single file such that one column is barcodes, annotations are within other columns
    if kind == 'disk':
        annotations = pd.read_csv(inputs,sep=sep,index_col=index_col).loc[:,cols_input]
    elif kind == 'memory':   # index_col will be ignored
        annotations = inputs.loc[:,cols_input]
    mappings = []
    for col in cols_input:
        mapping = annotations[col].to_dict()
        mappings.append(mapping)
    if cols_output is None:
        for i,col in enumerate(cols_input):
            adata.obs[col] = adata.obs_names.map(mappings[i]).fillna('Unknown').values
            adata.obs[col] = adata.obs[col].astype('str').astype('category')
    else:
        for i in range(len(cols_input)):
            adata.obs[cols_output[i]] = adata.obs_names.map(mappings[i]).fillna('Unknown').values
            adata.obs[cols_output[i]] = adata.obs[cols_output[i]].astype('str').astype('category')


[docs]def add_umap(adata,inputs,mode,cols=None,index_col=0,key='X_umap'):
    '''
    if umap embedding is pre-computed, add it back to adata object.

    :param adata: Anndata
    :param inputs: string, path to the the txt file where the umap embedding was stored.
    :param mode: string, valid value 'pandas_disk', 'pandas_memory', 'numpy'

        * **pandas_disk**: the `inputs` argument should be the path to the txt file
        * **pandas_memory**: the `inputs` argument should be the name of the pandas dataframe in the program, inputs=df
        * **numpy**, the `inputs` argument should be a 2D ndarray contains pre-sorted (same order as barcodes in adata) umap coordinates

    :param cols: list, what columns contain umap embeddings
    :param index_col: int, which column will serve as the index column.
    :param key: string, the key name to add to obsm

    Examples::

        from sctriangulate.preprocessing import add_umap
        add_umap(adata,inputs='umap.txt',mode='pandas_disk',cols=['umap1','umap2'],index_col=0)

    '''
    # make sure cols are [umap_x, umap_y]
    if mode == 'pandas_disk':
        df = pd.read_csv(inputs,sep='\t',index_col=index_col)
        umap_x = df[cols[0]].to_dict()
        umap_y = df[cols[1]].to_dict()
        adata.obs['umap_x'] = adata.obs_names.map(umap_x).values
        adata.obs['umap_y'] = adata.obs_names.map(umap_y).values
        adata.obsm[key] = adata.obs.loc[:,['umap_x','umap_y']].values
        adata.obs.drop(columns=['umap_x','umap_y'],inplace=True)
    elif mode == 'pandas_memory':
        df = inputs
        umap_x = df[cols[0]].to_dict()
        umap_y = df[cols[1]].to_dict()
        adata.obs['umap_x'] = adata.obs_names.map(umap_x).values
        adata.obs['umap_y'] = adata.obs_names.map(umap_y).values
        adata.obsm[key] = adata.obs.loc[:,['umap_x','umap_y']].values
        adata.obs.drop(columns=['umap_x','umap_y'],inplace=True)
    elif mode == 'numpy':  # assume the order is correct
        adata.obsm[key] = inputs

[docs]def doublet_predict(adata):  # gave RNA count or log matrix
    '''
    wrapper function for running srublet, a new column named 'doublet_scores' will be added to the adata

    :param adata: Anndata

    :return: dict

    Examples::

        from sctriangulate.preprocessing import doublet_predict
        mapping = doublet_predict(old_adata)

    '''
    from scipy.sparse import issparse
    import scrublet as scr
    if issparse(adata.X):
        adata.X = adata.X.toarray()
    counts_matrix = adata.X
    scrub = scr.Scrublet(counts_matrix)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=1, min_cells=1)
    adata.obs['doublet_scores'] = doublet_scores
    return adata.obs['doublet_scores'].to_dict()

[docs]def make_sure_adata_writable(adata,delete=False):
    '''
    maks sure the adata is able to write to disk, since h5 file is stricted typed, so no mixed dtype is allowd.
    this function basically is to detect the column of obs/var that are of mixed types, and delete them.

    :param adata: Anndata
    :param delete: boolean, False will just print out what columns are mixed type, True will automatically delete those columns

    :return: Anndata

    Examples::

        from sctriangulate.preprocessing import make_sure_adata_writable
        make_sure_adata_writable(adata,delete=True)


    '''
    # check index, can not have name
    var_names = adata.var_names
    obs_names = adata.obs_names
    var_names.name = None
    obs_names.name = None
    adata.var_names = var_names
    adata.obs_names = obs_names

    # make sure column is pure type, basically, if mixed tyep, delete the column, and print out the delete columns
    # go to: https://github.com/theislab/scanpy/issues/1866

    var = adata.var
    obs = adata.obs
    
    for col in var.columns:
        if var[col].dtypes == 'O':
            all_type = np.array([type(item) for item in var[col]])
            first = all_type[0]
            if (first==all_type).all() and first == str:   # object, but every item is str
                continue
            else:   # mixed type
                print('column {} in var will be deleted, because mixed types'.format(col))
                if delete:
                    adata.var.drop(columns=[col],inplace=True)
    
    for col in obs.columns:
        if obs[col].dtypes == 'O':
            all_type = np.array([type(item) for item in obs[col]])
            first = all_type[0]
            if (first==all_type).all() and first == str:   # object, but every item is str
                continue
            else:   # mixed type
                print('column {} in obs will be deleted, because mixed types'.format(col))
                if delete:
                    adata.obs.drop(columns=[col],inplace=True)

    return adata


[docs]def scanpy_recipe(adata,species='human',is_log=False,resolutions=[1,2,3],modality='rna',umap=True,save=True,pca_n_comps=None,n_top_genes=3000):
    '''
    Main preprocessing function. Run Scanpy normal pipeline to achieve Leiden clustering with various resolutions across multiple modalities.

    :param adata: Anndata
    :param species: string, 'human' or 'mouse'
    :param is_log: boolean, whether the adata.X is count or normalized data.
    :param resolutions: list, what leiden resolutions the users want to obtain.
    :param modality: string, valid values: 'rna','adt','atac', 'binary'[mutation data, TCR data, etc]
    :param umap: boolean, whether to compute umap embedding.
    :param save: boolean, whether to save the obtained adata object with cluster label information in it.
    :param pca_n_comps: int, how many PCs to keep when running PCA. Suggestion: RNA (30-50), ADT (15), ATAC (100)
    :param n_top_genes: int, how many features to keep when selecting highly_variable_genes. Suggestion: RNA (3000), ADT (ignored), ATAC (50000-100000)

    :return: Anndata

    Examples::

        from sctriangulate.preprocessing import scanpy_recipe
        # rna
        adata = scanpy_recipe(adata,is_log=False,resolutions=[1,2,3],modality='rna',pca_n_comps=50,n_top_genes=3000)
        # adt
        adata = scanpy_recipe(adata,is_log=False,resolutions=[1,2,3],modality='adt',pca_n_comps=15)
        # atac
        adata = scanpy_recipe(adata,is_log=False,resolutions=[1,2,3],modality='atac',pca_n_comps=100,n_top_genes=100000)
        # binary
        adata = scanpy_recipe(adata,resolutions=[1,2,3],modality='binary')

    '''
    adata.var_names_make_unique()
    # normal analysis
    if modality == 'rna':
        if not is_log:   # count data
            if species == 'human':
                adata.var['mt'] = adata.var_names.str.startswith('MT-')
            elif species == 'mouse':
                adata.var['mt'] = adata.var_names.str.startswith('mt-')
            sc.pp.calculate_qc_metrics(adata,qc_vars=['mt'],percent_top=None,inplace=True,log1p=False)
            sc.pp.normalize_total(adata,target_sum=1e4)
            sc.pp.log1p(adata)
            sc.pp.highly_variable_genes(adata,flavor='seurat',n_top_genes=n_top_genes)
            adata.raw = adata
            adata = adata[:,adata.var['highly_variable']]
            sc.pp.regress_out(adata,['total_counts','pct_counts_mt'])
            sc.pp.scale(adata,max_value=10)
            sc.tl.pca(adata,n_comps=pca_n_comps)
            sc.pp.neighbors(adata)
            for resolution in resolutions:
                sc.tl.leiden(adata,resolution=resolution,key_added='sctri_{}_leiden_{}'.format(modality,resolution))
            if umap:
                sc.tl.umap(adata)
            # put raw back to X, and make sure it is sparse matrix
            adata = adata.raw.to_adata()
            if not issparse(adata.X):
                adata.X = csr_matrix(adata.X)
            if save:
                resolutions = '_'.join([str(item) for item in resolutions])
                adata.write('adata_after_scanpy_recipe_{}_{}_umap_{}.h5ad'.format(modality,resolutions,umap))
 

        else:   # log(1+x) and depth normalized data
            if species == 'human':
                adata.var['mt'] = adata.var_names.str.startswith('MT-')
            elif species == 'mouse':
                adata.var['mt'] = adata.var_names.str.startswith('mt-')
            sc.pp.calculate_qc_metrics(adata,qc_vars=['mt'],percent_top=None,inplace=True,log1p=False)
            sc.pp.highly_variable_genes(adata,flavor='seurat',n_top_genes=n_top_genes)
            adata.raw = adata
            adata = adata[:,adata.var['highly_variable']]
            sc.pp.regress_out(adata,['total_counts','pct_counts_mt'])
            sc.pp.scale(adata,max_value=10)
            sc.tl.pca(adata,n_comps=pca_n_comps)
            sc.pp.neighbors(adata)
            for resolution in resolutions:
                sc.tl.leiden(adata,resolution=resolution,key_added='sctri_{}_leiden_{}'.format(modality,resolution))
            if umap:
                sc.tl.umap(adata)
            # put raw back to X, and make sure it is sparse matrix
            adata = adata.raw.to_adata()
            if not issparse(adata.X):
                adata.X = csr_matrix(adata.X)
            if save:
                resolutions = '_'.join([str(item) for item in resolutions])
                adata.write('adata_after_scanpy_recipe_{}_{}_umap_{}.h5ad'.format(modality,resolutions,umap))

    elif modality == 'atac':
        if not is_log:
            sc.pp.calculate_qc_metrics(adata,percent_top=None,inplace=True,log1p=False)
            sc.pp.normalize_total(adata,target_sum=1e4)
            sc.pp.log1p(adata)
            sc.pp.highly_variable_genes(adata,flavor='seurat',n_top_genes=n_top_genes)
            adata.raw = adata
            adata = adata[:,adata.var['highly_variable']]
            #sc.pp.scale(adata,max_value=10)   # because in episcanpy toturial, it seems to be ignored
            sc.tl.pca(adata,n_comps=pca_n_comps)
            sc.pp.neighbors(adata)
            for resolution in resolutions:
                sc.tl.leiden(adata,resolution=resolution,key_added='sctri_{}_leiden_{}'.format(modality,resolution))
            if umap:
                sc.tl.umap(adata)
            adata = adata.raw.to_adata()
            if not issparse(adata.X):
                adata.X = csr_matrix(adata.X)
            if save:
                resolutions = '_'.join([str(item) for item in resolutions])
                adata.write('adata_after_scanpy_recipe_{}_{}_umap_{}.h5ad'.format(modality,resolutions,umap))
        else:
            sc.pp.calculate_qc_metrics(adata,percent_top=None,inplace=True,log1p=False)
            sc.pp.highly_variable_genes(adata,flavor='seurat',n_top_genes=n_top_genes)
            adata.raw = adata
            adata = adata[:,adata.var['highly_variable']]
            #sc.pp.scale(adata,max_value=10)
            sc.tl.pca(adata,n_comps=pca_n_comps)
            sc.pp.neighbors(adata)
            for resolution in resolutions:
                sc.tl.leiden(adata,resolution=resolution,key_added='sctri_{}_leiden_{}'.format(modality,resolution))
            if umap:
                sc.tl.umap(adata)
            adata = adata.raw.to_adata()
            if not issparse(adata.X):
                adata.X = csr_matrix(adata.X)
            if save:
                resolutions = '_'.join([str(item) for item in resolutions])
                adata.write('adata_after_scanpy_recipe_{}_{}_umap_{}.h5ad'.format(modality,resolutions,umap))

    elif modality == 'adt':
        if not is_log:
            sc.pp.calculate_qc_metrics(adata,percent_top=None,inplace=True,log1p=False)
            adata.X = make_sure_mat_sparse(Normalization.CLR_normalization(make_sure_mat_dense(adata.X)))
            sc.tl.pca(adata,n_comps=pca_n_comps)
            sc.pp.neighbors(adata)
            for resolution in resolutions:
                sc.tl.leiden(adata,resolution=resolution,key_added='sctri_{}_leiden_{}'.format(modality,resolution))
            if umap:
                sc.tl.umap(adata)
            if not issparse(adata.X):
                adata.X = csr_matrix(adata.X)
            if save:
                resolutions = '_'.join([str(item) for item in resolutions])
                adata.write('adata_after_scanpy_recipe_{}_{}_umap_{}.h5ad'.format(modality,resolutions,umap))
        else:
            sc.tl.pca(adata,n_comps=pca_n_comps)
            sc.pp.neighbors(adata)
            for resolution in resolutions:
                sc.tl.leiden(adata,resolution=resolution,key_added='sctri_{}_leiden_{}'.format(modality,resolution))
            if umap:
                sc.tl.umap(adata)
            if not issparse(adata.X):
                adata.X = csr_matrix(adata.X)
            if save:
                resolutions = '_'.join([str(item) for item in resolutions])
                adata.write('adata_after_scanpy_recipe_{}_{}_umap_{}.h5ad'.format(modality,resolutions,umap))

    elif modality == 'binary':  # mutation 
        #sc.tl.pca(adata,n_comps=pca_n_comps)
        sc.pp.neighbors(adata,use_rep='X',metric='jaccard')
        for resolution in resolutions:
            sc.tl.leiden(adata,resolution=resolution,key_added='sctri_{}_leiden_{}'.format(modality,resolution))
        if umap:
            sc.tl.umap(adata)
        if not issparse(adata.X):
            adata.X = csr_matrix(adata.X)
        if save:
            resolutions = '_'.join([str(item) for item in resolutions])
            adata.write('adata_after_scanpy_recipe_{}_{}_umap_{}.h5ad'.format(modality,resolutions,umap))

    elif modality == 'spatial':
        sc.pp.scale(adata)
        sc.pp.neighbors(adata)
        for resolution in resolutions:
            sc.tl.leiden(adata,resolution=resolution,key_added='sctri_{}_leiden_{}'.format(modality,resolution))
        if save:
            resolutions = '_'.join([str(item) for item in resolutions])
            adata.write('adata_after_scanpy_recipe_{}_{}_umap_{}.h5ad'.format(modality,resolutions,False))        

    return adata
    




[docs]def concat_rna_and_other(adata_rna,adata_other,umap,umap_key,name,prefix):
    '''
    concatenate rna adata and another modality's adata object

    :param adata_rna: AnnData
    :param adata_other: Anndata
    :param umap: string, whose umap to use, either 'rna' or 'other'
    :param umap_key: string, either 'X_umap' or 'X_tsne' or other string for obsm key
    :param name: string, the name of other modality, for example, 'adt' or 'atac'
    :param prefix: string, the prefix added in front of features from other modality, by scTriangulate convertion, adt will be 'AB_', atac will be ''.

    :return adata_combine: Anndata

    Examples::

        from sctriangulate.preprocessing import concat_rna_and_other
        concat_rna_and_other(adata_rna,adata_adt,umap='rna',name='adt',prefix='AB_') 

    '''
    adata_rna = adata_rna.copy()
    adata_other = adata_other.copy()
    # remove layers, [!obsm], varm, obsp, varp, raw
    for adata in [adata_rna,adata_other]:
        del adata.layers
        del adata.varm
        del adata.obsp
        del adata.varp
        del adata.raw
    adata_other = adata_other[adata_rna.obs_names,:]  # make sure the obs order is the same
    adata_other.var_names = [prefix + item for item in adata_other.var_names]
    adata_combine = ad.concat([adata_rna,adata_other],axis=1,join='outer',merge='first',label='modality',keys=['rna','{}'.format(name)])
    if umap == 'rna':
        adata_combine.obsm[umap_key] = adata_rna.obsm[umap_key]
    elif umap == 'other':
        adata_combine.obsm[umap_key] = adata_other.obsm[umap_key]
    if not issparse(adata_combine.X):
        adata_combine.X = csr_matrix(adata_combine.X)
    return adata_combine


[docs]def nca_embedding(adata,nca_n_components,label,method,max_iter=50,plot=True,save=True,format='pdf',legend_loc='on data',n_top_genes=None,hv_features=None,add_features=None):
    '''
    Doing Neighborhood component ananlysis (NCA), so it is a supervised PCA that takes the label from the annotation, and try to generate a UMAP 
    embedding that perfectly separate the labelled clusters.

    :param adata: the Anndata
    :param nca_n_components: recommend to be 10 based on `Ref <https://www.nature.com/articles/s41586-021-03969-3>`_
    :param label: string, the column name which contains the label information
    :param method: either 'umap' or 'tsne'
    :param max_iter: for the NCA, default is 50, it is generally good enough
    :param plot: whether to plot the umap/tsne or not
    :param save: whether to save the plot or not
    :param format: the saved format, default is 'pdf'
    :param legend_loc: 'on data' or 'right margin'
    :param n_top_genes: how many hypervariable genes to choose for NCA, recommended 3000 or 5000, default is None, means there will be other features to add, multimodal setting
    :param hv_features: a list contains the user-supplied hypervariable genes/features, in multimodal setting, this can be [rna genes] + [ADT protein]
    :param add_features: this should be another adata contains features from other modalities, or None means just for RNA

    Example::

        from sctriangulate.preprocessing import nca_embedding
        # only RNA
        nca_embedding(adata,nca_n_components=10,label='annotation1',method='umap',n_top_genes=3000)
        # RNA + ADT
        # list1 contains [gene features that are variable] and [ADT features that are variable]
        nca_embedding(adata_rna,nca_n_components=10,label='annotation1',method='umap',n_top_genes=3000,hv_features=list1, add_features=adata_adt)
    '''
    from sklearn.neighbors import NeighborhoodComponentsAnalysis
    adata = adata
    if n_top_genes is not None:
        sc.pp.highly_variable_genes(adata,flavor='seurat',n_top_genes=n_top_genes)
    else:
        if add_features is not None:  # first add the features, input should be anndata
            adata = concat_rna_and_other(adata,add_features,umap='rna',umap_key='X_umap',name='add_features',prefix='add_features_')
        if hv_features is not None:    # custom hv
            tmp = pd.Series(index=adata.var_names,data=np.full(len(adata.var_names),fill_value=False))
            tmp.loc[hv_features] = True
            adata.var['highly_variable'] = tmp.values
    adata.raw = adata
    adata = adata[:,adata.var['highly_variable']]
    X = make_sure_mat_dense(adata.X)
    y = adata.obs[label].values
    nca = NeighborhoodComponentsAnalysis(n_components=nca_n_components,max_iter=max_iter)
    embed = nca.fit_transform(X,y)  # (n_cells,n_components)
    adata.obsm['X_nca'] = embed
    adata = adata.raw.to_adata()
    if method == 'umap':
        sc.pp.neighbors(adata,use_rep='X_nca')
        sc.tl.umap(adata)
        sc.pl.umap(adata,color=label,frameon=False,legend_loc=legend_loc)
        if save:
            plt.savefig(os.path.join('.','nca_embedding_{}_{}.{}'.format(label,method,format)),bbox_inches='tight')
            plt.close()
    elif method == 'tsne':
        sc.tl.tsne(adata,use_rep='X_nca')
        sc.pl.tsne(adata,color=label,frameon=False,legend_loc=legend_loc)
        if save:
            plt.savefig(os.path.join('.','nca_embedding_{}_{}.{}'.format(label,method,format)),bbox_inches='tight')
            plt.close()
    adata.X = make_sure_mat_sparse(adata.X)
    return adata



[docs]def umap_dual_view_save(adata,cols,method='umap'):
    '''
    generate a pdf file with two umap up and down, one is with legend on side, another is with legend on data.
    More importantly, this allows you to generate multiple columns iteratively.

    :param adata: Anndata
    :param cols: list, all columns from which we want to draw umap.
    :param method: string, either umap or tsne

    Examples::

        from sctriangulate.preprocessing import umap_dual_view_save
        umap_dual_view_save(adata,cols=['annotation1','annotation2','total_counts'])
    '''
    if method == 'umap':
        for col in cols:
            fig,ax = plt.subplots(nrows=2,ncols=1,figsize=(8,20),gridspec_kw={'hspace':0.3})  # for final_annotation
            sc.pl.umap(adata,color=col,frameon=False,ax=ax[0])
            sc.pl.umap(adata,color=col,frameon=False,legend_loc='on data',legend_fontsize=5,ax=ax[1])
            plt.savefig('umap_dual_view_{}.pdf'.format(col),bbox_inches='tight')
            plt.close()
    elif method == 'tsne':
        for col in cols:
            fig,ax = plt.subplots(nrows=2,ncols=1,figsize=(8,20),gridspec_kw={'hspace':0.3})  # for final_annotation
            sc.pl.tsne(adata,color=col,frameon=False,ax=ax[0])
            sc.pl.tsne(adata,color=col,frameon=False,legend_loc='on data',legend_fontsize=5,ax=ax[1])
            plt.savefig('tsne_dual_view_{}.pdf'.format(col),bbox_inches='tight')
            plt.close()      

[docs]def just_log_norm(adata):
    '''
    perform CPTT and log operation on adata.X in place, no return value

    :param adata: the anndata to perform operations on

    Example:

        just_log_norm(adata)
    '''
    sc.pp.normalize_total(adata,target_sum=1e4)
    sc.pp.log1p(adata)
    return adata

[docs]def format_find_concat(adata,canonical_chr_only=True,gtf_file='gencode.v38.annotation.gtf',key_added='gene_annotation',**kwargs):
    '''
    this is a wrapper function to add nearest genes to your ATAC peaks or bins. For instance, if the peak is chr1:55555-55566,
    it will be annotated as chr1:55555-55566_gene1;gene2
    :param adata: The anndata, the var_names is the peak/bin, please make sure the format is like chr1:55555-55566
    :param canonical_chr_only: boolean, default to True, means only contain features on canonical chromosomes. for human, it is chr1-22 and X,Y
    :param gtf_file: the path to the gtf files, we provide the hg38 on this `google drive link <https://drive.google.com/file/d/11gbJl2-wZr3LbpWaU9RiUAGPebqWYi1z/view?usp=sharing>`_ to download
    :param key_added: string, the column name where the gene annotation will be inserted to adata.var, default is 'gene_annotation'
    :return adata: Anndata, the gene annotation will be added to var, and the var_name will be suffixed with gene annotation, if canonical_chr_only is True, then only features on canonical 
                   chromsome will be retained.
    Example::
        adata = format_find_concat(adata)
    '''
    adata= reformat_peak(adata,canonical_chr_only=canonical_chr_only)
    find_genes(adata,gtf_file=gtf_file,key_added=key_added,**kwargs)
    adata.var_names = [name + '_' + gene for name,gene in zip(adata.var_names,adata.var[key_added])]
    return adata



[docs]class GeneConvert(object):
    '''
    A collection of gene symbol conversion functions.

    Now support:

    1. ensemblgene id to gene symbol.


    '''

[docs]    @staticmethod
    def ensemblgene_to_symbol(query,species):
        '''
        Examples::

            from sctriangulate.preprocessing import GeneConvert
            converted_list = GeneConvert.ensemblgene_to_symbol(['ENSG00000010404','ENSG00000010505'],species='human')
        '''
        # assume query is a list, will also return a list
        import mygene
        mg = mygene.MyGeneInfo()
        out = mg.querymany(query,scopes='ensemblgene',fileds='symbol',species=species,returnall=True,as_dataframe=True,df_index=True)
        result = out['out']['symbol'].fillna('unknown_gene').tolist()
        try:
            assert len(query) == len(result)
        except AssertionError:    # have duplicate results
            df = out['out']
            df_unique = df.loc[~df.index.duplicated(),:]
            result = df_unique['symbol'].fillna('unknown_gene').tolist()
        return result


def dual_gene_plot(adata,gene1,gene2,s=8,save=True,format='pdf',dir='.',umap_lim=None):
    from scipy.sparse import issparse
    if issparse(adata.X):
        adata.X = adata.X.toarray()
    index1 = np.where(adata.var_names == gene1)[0][0]
    index2 = np.where(adata.var_names == gene2)[0][0]
    exp1 = adata.X[:,index1]
    exp2 = adata.X[:,index2]
    color = []
    for i in range(len(exp1)):
        if exp1[i] > 0 and exp2[i] > 0:
            color.append('#F2DE77')
        elif exp1[i] > 0 and exp2[i] == 0:
            color.append('#5ABF9A')
        elif exp1[i] == 0 and exp2[i] > 0:
            color.append('#F25C69')
        else:
            color.append('lightgrey')
    fig, ax = plt.subplots()
    if umap_lim is not None:
        ax.set_xlim(umap_lim[0])
        ax.set_ylim(umap_lim[1])
    ax.scatter(x=adata.obsm['X_umap'][:,0],y=adata.obsm['X_umap'][:,1],s=s,c=color)
    import matplotlib.lines as mlines
    ax.legend(handles=[mlines.Line2D([],[],marker='o',color=i,linestyle='') for i in ['#F2DE77','#5ABF9A','#F25C69','lightgrey']],
              labels=['Both','{}'.format(gene1),'{}'.format(gene2),'None'],frameon=False,loc='upper left',bbox_to_anchor=[1,1])
    if save:
        plt.savefig(os.path.join(dir,'sctri_dual_gene_plot_{}_{}.{}'.format(gene1,gene2,format)),bbox_inches='tight')
        plt.close()
    return ax


def multi_gene_plot(adata,genes,s=8,save=True,format='pdf',dir='.',umap_lim=None):
    from scipy.sparse import issparse
    if issparse(adata.X):
        adata.X = adata.X.toarray()
    exp_list = []
    for gene in genes:
        index_gene = np.where(adata.var_names == gene)[0][0]
        exp_gene = adata.X[:,index_gene]
        exp_list.append(exp_gene)
    color = []
    for i in range(len(exp_list[0])):
        if len(genes) == 3:
            c = ['#04BFBF','#83A603','#F7766D']
        elif len(genes) == 4:
            c = ['#04BFBF','#83A603','#F7766D','#E36DF2']
        elif len(genes) == 5:
            c = ['#04BFBF','#83A603','#F7766D','#E36DF2','#A69B03']                       
        b = '#BABABA'
        l_exp = np.array([exp[i] for exp in exp_list])
        n_exp = np.count_nonzero(l_exp > 0)
        if n_exp > 1:
            color.append(c[np.where(l_exp==l_exp.max())[0][0]])
        elif n_exp == 1:
            color.append(c[np.where(l_exp>0)[0][0]])
        elif n_exp == 0:
            color.append(b)
    fig, ax = plt.subplots()
    if umap_lim is not None:
        ax.set_xlim(umap_lim[0])
        ax.set_ylim(umap_lim[1])
    ax.scatter(x=adata.obsm['X_umap'][:,0],y=adata.obsm['X_umap'][:,1],s=s,c=color)
    import matplotlib.lines as mlines
    ax.legend(handles=[mlines.Line2D([],[],marker='o',color=i,linestyle='') for i in c+[b]],
              labels=genes + ['None'],frameon=False,
              loc='upper left',bbox_to_anchor=[1,1])
    if save:
        output = '_'.join(genes)
        plt.savefig(os.path.join(dir,'sctri_multi_gene_plot_{}.{}'.format(output,format)),bbox_inches='tight')
        plt.close()
    return ax


[docs]def make_sure_mat_dense(mat):
    '''
    make sure a matrix is dense

    :param mat: ndarary

    :return mat: ndarray (dense)

    Examples::

        mat = make_sure_mat_dense(mat)
    '''
    if not issparse(mat):
        pass
    else:
        mat = mat.toarray()
    return mat

[docs]def make_sure_mat_sparse(mat):  # will be csr if the input mat is a dense array
    '''
    make sure a matrix is sparse

    :param mat: ndarary

    :return mat: ndarray (sparse)

    Examples::

        mat = make_sure_mat_dense(mat)
    '''
    if not issparse(mat):
        mat = csr_matrix(mat)
    else:
        pass
    return mat

[docs]class Normalization(object):
    '''
    a series of Normalization functions

    Now support:

    1. CLR normalization
    2. total count normalization (CPTT, CPM)
    3. GMM normalization



    '''
    # matrix should be cell x feature, expecting a ndarray

[docs]    @staticmethod
    def CLR_normalization(mat):
        '''
        Examples::

            from sctriangulate.preprocessing import Normalization
            post_mat = Normalization.CLR_normalization(pre_mat)
        '''
        from scipy.stats import gmean
        gmeans = gmean(mat+1,axis=1).reshape(-1,1)
        post = np.log(mat/gmeans + 1)
        return post

[docs]    @staticmethod
    def total_normalization(mat,target=1e4):
        '''
        Examples::

            from sctriangulate.preprocessing import Normalization
            post_mat = Normalization.total_normalization(pre_mat)
        '''
        if target is None:
            target = np.mean(mat,axis=1).reshape(-1,1)
        total = np.sum(mat,axis=1).reshape(-1,1)
        sf = total/target
        post = np.log(mat/sf + 1)
        return post

[docs]    @staticmethod
    def GMM_normalization(mat,non_negative=False):
        '''
        This method is a re-implementaion from `Stephenson et al <https://www.nature.com/articles/s41591-021-01329-2>`_,
        The raw counts are first subjected to a CPTT total normalization, then a GaussianMixture model was fitted to the data,
        we substract the mean of the background from the data to remove background noise. Optionally, user can make the post-processed
        matrix as non-negative by setting ``non_negative==True``

        Examples::

            from sctriangulate.preprocessing import Normalization
            post_mat = Normalization.GMM_normalization(pre_mat)
        '''
        mat = Normalization.total_normalization(mat)
        from sklearn.mixture import GaussianMixture
        model = GaussianMixture(n_components=2,random_state=0)
        model.fit(mat)
        means = model.means_  # (n_components,n_features)
        bg_index = np.argmin(means.mean(axis=1))
        bg_mean = means[bg_index,:].reshape(1,-1)
        post = mat - bg_mean
        if non_negative:
            post = np.where(post>=0,post,0)
        return post


[docs]def gene_activity_count_matrix_new_10x(fall_in_promoter,fall_in_gene,valid=None):
    '''
    Full explanation please refer to ``gene_activity_count_matrix_old_10x``

    Examples::

        from sctriangulate.preprocessing import gene_activity_count_matrix_new_10x
        gene_activity_count_matrix_new_10x(fall_in_promoter,fall_in_gene,valid=None)       
    '''

    gene_promoter = pd.read_csv(fall_in_promoter,sep='\t',header=None)
    gene_body = pd.read_csv(fall_in_gene,sep='\t',header=None)
    bucket = []
    for i in range(gene_promoter.shape[0]):
        row = gene_promoter.iloc[i]
        in_gene = row[3]
        in_barcode = row[6]
        in_count = row[7]
        try:
            in_barcode = in_barcode.split(';')
            in_count = [int(item) for item in in_count.split(';')]
        except AttributeError:  # means no fragments fall into the promoter
            continue
        # tmp will be three column, barcode, count, gene, no index
        tmp = pd.DataFrame({'barcode':in_barcode,'count':in_count}).groupby(by='barcode')['count'].sum().to_frame()
        tmp['gene'] = np.full(shape=tmp.shape[0],fill_value=in_gene)
        tmp.reset_index(inplace=True)
        bucket.append(tmp)
    for i in range(gene_body.shape[0]):
        row = gene_body.iloc[i]
        in_gene = row[3]
        in_barcode = row[6]
        in_count = row[7]
        try:
            in_barcode = in_barcode.split(';')
            in_count = [int(item) for item in in_count.split(';')]
        except AttributeError:  # means no fragments fall into the promoter
            continue
        # tmp will be three column, barcode, count, gene, no index
        tmp = pd.DataFrame({'barcode':in_barcode,'count':in_count}).groupby(by='barcode')['count'].sum().to_frame()
        tmp['gene'] = np.full(shape=tmp.shape[0],fill_value=in_gene)
        tmp.reset_index(inplace=True)
        bucket.append(tmp)
    df = pd.concat(bucket)
    if valid is not None:
        df = df.loc[df['barcode'].isin(valid),:]
    final = df.groupby(by=['barcode','gene'])['count'].sum().unstack(fill_value=0)
    return final

    

[docs]def gene_activity_count_matrix_old_10x(fall_in_promoter,fall_in_gene,valid=None):
    '''
    this function is to generate gene activity count matrix, please refer to ``gene_activity_count_matrix_new_10x`` for latest
    version of 10x fragements.tsv output.

    how to get these two arguments? (LIGER team approach)


    1. sort the fragment, gene and promoter bed, or use function in this module to sort the reference bed files::

        sort -k1,1 -k2,2n -k3,3n pbmc_granulocyte_sorted_10k_atac_fragments.tsv > atac_fragments.sort.bed
        sort -k 1,1 -k2,2n -k3,3n hg19_genes.bed > hg19_genes.sort.bed
        sort -k 1,1 -k2,2n -k3,3n hg19_promoters.bed > hg19_promoters.sort.bed

    2. bedmap::

        module load bedops
        bedmap --ec --delim "\t" --echo --echo-map-id hg19_promoters.sort.bed atac_fragments.sort.bed > atac_promoters_bc.bed
        bedmap --ec --delim "\t" --echo --echo-map-id hg19_genes.sort.bed atac_fragments.sort.bed > atac_genes_bc.bed


    the following was taken from http://htmlpreview.github.io/?https://github.com/welch-lab/liger/blob/master/vignettes/Integrating_scRNA_and_scATAC_data.html

    * **delim**. This changes output delimiter from ‘|’ to indicated delimiter between columns, which in our case is “\t”.

    * **ec**. Adding this will check all problematic input files.

    * **echo**. Adding this will print each line from reference file in output. The reference file in our case is gene or promoter index.

    * **echo-map-id**. Adding this will list IDs of all overlapping elements from mapping files, which in our case are cell barcodes from fragment files.

    3. Finally::

        from sctriangulate.preprocessing import gene_activity_count_matrix_old_10x
        gene_activity_count_matrix_old_10x(fall_in_promoter,fall_in_gene,valid=None)
   
    '''
    gene_promoter = pd.read_csv(fall_in_promoter,sep='\t',header=None)
    gene_body = pd.read_csv(fall_in_gene,sep='\t',header=None)
    bucket = []
    for i in range(gene_promoter.shape[0]):
        row = gene_promoter.iloc[i]
        in_gene = row[3]
        in_barcode = row[6]
        try:
            in_barcode = in_barcode.split(';')
        except AttributeError:  # means no fragments fall into the promoter
            continue
        tmp = pd.Series(in_barcode).value_counts().to_frame(name='count')
        tmp['gene'] = np.full(shape=tmp.shape[0],fill_value=in_gene)
        tmp.reset_index(inplace=True)  # three column: index, count, gene
        bucket.append(tmp)
    for i in range(gene_body.shape[0]):
        row = gene_body.iloc[i]
        in_gene = row[3]
        in_barcode = row[6]
        try:
            in_barcode = in_barcode.split(';')
        except AttributeError:  # means no fragments fall into the promoter
            continue
        tmp = pd.Series(in_barcode).value_counts().to_frame(name='count')
        tmp['gene'] = np.full(shape=tmp.shape[0],fill_value=in_gene)
        tmp.reset_index(inplace=True)  # three column: index, count, gene
        bucket.append(tmp)
    df = pd.concat(bucket)
    if valid is not None:
        df = df.loc[df['index'].isin(valid),:]
    final = df.groupby(by=['index','gene'])['count'].sum().unstack(fill_value=0)
    return final


def gene_bed_to_promoter_bed(gene_bed_path,promoter_bed_path,up_bp=3000):
    gene_bed = pd.read_csv(gene_bed_path,header=None,sep='\t')
    with open(promoter_bed_path,'w') as f:
        for i in range(gene_bed.shape[0]):
            row = gene_bed.iloc[i]
            chro = row[0]
            start = row[1]
            end = row[2]
            name = row[3]
            score = row[4]
            strand = row[5]
            if strand == '+':
                new_start = start - up_bp
                new_end = start
            else:
                new_start = end
                new_end = end + up_bp
            f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(chro,new_start,new_end,name,score,strand))


def ensembl_gtf_to_gene_bed(gtf_path,bed_path,sort=True):

    # only interested in gene feature
    hg38_gtf = pd.read_csv(gtf_path,skiprows=5,header=None,sep='\t')
    hg38_gtf_gene = hg38_gtf.loc[hg38_gtf[2]=='gene',:]

    # gotta have gene symbol
    col = []
    for i in range(hg38_gtf_gene.shape[0]):
        metadata = hg38_gtf_gene.iloc[i,8]
        if 'gene_name' in metadata:
            col.append(True)
        else:
            col.append(False)
    hg38_gtf_gene_have_symbol = hg38_gtf_gene.loc[col,:]

    # add biotype and gene name
    col1 = []
    col2 = []
    for i in range(hg38_gtf_gene_have_symbol.shape[0]):
        metadata = hg38_gtf_gene_have_symbol.iloc[i, 8]
        biotype = metadata.split('; ')[-1].split(' ')[-1].strip(';').strip('"')
        name = metadata.split('; ')[2].split(' ')[1].strip('"')
        col1.append(biotype)
        col2.append(name)
    hg38_gtf_gene_have_symbol['biotype'] = col1
    hg38_gtf_gene_have_symbol['name'] = col2

    # biotype has to be either protein_coding, IG or TR gene
    col = (hg38_gtf_gene_have_symbol['biotype']=='protein_coding') |\
          (hg38_gtf_gene_have_symbol['biotype']=='IG_C_gene') |\
          (hg38_gtf_gene_have_symbol['biotype']=='IG_D_gene') |\
          (hg38_gtf_gene_have_symbol['biotype']=='IG_J_gene') |\
          (hg38_gtf_gene_have_symbol['biotype']=='IG_V_gene') |\
          (hg38_gtf_gene_have_symbol['biotype']=='TR_C_gene') |\
          (hg38_gtf_gene_have_symbol['biotype']=='TR_D_gene') |\
          (hg38_gtf_gene_have_symbol['biotype']=='TR_J_gene') |\
          (hg38_gtf_gene_have_symbol['biotype']=='TR_V_gene')
    hg38_gtf_gene_have_symbol_biotype_correct = hg38_gtf_gene_have_symbol.loc[col,:]

    # chromsome need to be correct
    chr_need_chr = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22']
    chr_need_int = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
    chr_need_other = ['X','Y'] # don't include MT decause fragment.tsv file doesn't output that
    chr_need = chr_need_chr + chr_need_int + chr_need_other

    hg38_gtf_gene_have_symbol_biotype_correct_chr = hg38_gtf_gene_have_symbol_biotype_correct.loc[hg38_gtf_gene_have_symbol_biotype_correct[0].isin(chr_need),:]

    prefixed_chr = ['chr' + str(item) for item in hg38_gtf_gene_have_symbol_biotype_correct_chr[0]]
    hg38_gtf_gene_have_symbol_biotype_correct_chr[0] = prefixed_chr

    # get final result, BED6 format
    final = hg38_gtf_gene_have_symbol_biotype_correct_chr.loc[:,[0,3,4,'name',5,6]]
    if sort:
        '''
        equivalent to:
        sort -k1,1 -k2,2n -k3,3n gene.bed
        '''
        final.sort_values(by=[0,3,4],inplace=True)
    final.to_csv(bed_path,sep='\t',index=None,header=None)

# this function is taken from episcanpy, all the credits to the original developer:
# https://github.com/colomemaria/epiScanpy/blob/master/episcanpy/tools/_find_genes.py

# to download the gtf file
'''
https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz
https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz
'''
[docs]def find_genes(adata,
                 gtf_file,
                 key_added='gene_annotation',
                 upstream=2000,
                 downstream=0,
                 feature_type='gene',
                 annotation='HAVANA',
                 raw=False):
    """
    This function is taken from `episcanpy <https://github.com/colomemaria/epiScanpy/blob/master/episcanpy/tools/_find_genes.py>`_.
    all the credits to the original developer

    merge values of peaks/windows/features overlapping genebodies + 2kb upstream. It is possible to extend the search for closest gene to a given 
    number of bases downstream as well. There is commonly 2 set of annotations in a gtf file(HAVANA, ENSEMBL). By default, the function will search 
    annotation from HAVANA but other annotation label/source can be specifed. It is possible to use other type of features than genes present in a 
    gtf file such as transcripts or CDS.


    Examples::

        from sctriangulate.preprocessing import find_genes
        find_genes(adata,gtf_file='gencode.v38.annotation.gtf)
    
    """
    ### extracting the genes
    gtf = {}
    with open(gtf_file) as f:
        for line in f:
            if line[0:2] != '##' and '\t'+feature_type+'\t' in line and '\t'+annotation+'\t' in line:
                line = line.rstrip('\n').split('\t')
                if line[6] == '-':
                    if line[0] not in gtf.keys():
                        gtf[line[0]] = [[int(line[3])-downstream, int(line[4])+upstream,line[-1].split(';')[:-1]]]
                    else:
                        gtf[line[0]].append([int(line[3])-downstream, int(line[4])+upstream,line[-1].split(';')[:-1]])
                else:
                    if line[0] not in gtf.keys():
                        gtf[line[0]] = [[int(line[3])-upstream, int(line[4])+downstream,line[-1].split(';')[:-1]]]
                    else:
                        gtf[line[0]].append([int(line[3])-upstream, int(line[4])+downstream,line[-1].split(';')[:-1]])

    # extracting the feature coordinates
    raw_adata_features = {}
    feature_index = 0
    for line in adata.var_names.tolist():
        line = line.split('_')
        if line[0] not in raw_adata_features.keys():
            raw_adata_features[line[0]] = [[int(line[1]),int(line[2]), feature_index]]
        else:
            raw_adata_features[line[0]].append([int(line[1]),int(line[2]), feature_index])
        feature_index += 1
    
    ## find the genes overlaping the features.
    gene_index = []
    for chrom in raw_adata_features.keys():
        if chrom in gtf.keys():
            chrom_index = 0
            previous_features_index = 0
            for feature in raw_adata_features[chrom]:
                gene_name = []
                feature_start = feature[0]
                feature_end = feature[1]
                for gene in gtf[chrom]:
                    if (gene[1]<= feature_start): # the gene is before the feature. we need to test the next gene.
                        continue
                    elif (feature_end <= gene[0]): # the gene is after the feature. we need to test the next feature.
                        break
                    else: # the window is overlapping the gene. 
                        for n in gene[-1]:
                            if 'gene_name' in n:
                                gene_name.append(n.lstrip('gene_name "').rstrip('""'))
                        
                if gene_name == []:
                    gene_index.append('intergenic')
                elif len(gene_name)==1:
                    gene_index.append(gene_name[0])
                else:
                    gene_index.append(";".join(list(set(gene_name))))
                    
    adata.var[key_added] = gene_index

    
[docs]def reformat_peak(adata,canonical_chr_only=True):
    '''
    To use ``find_genes`` function, please first reformat the peak from 10X format "chr1:10109-10357" to
    find_gene format "chr1_10109_10357"

    :param adata: AnnData
    :param canonical_chr_only: boolean, only kept the canonical chromosome

    :return: AnnData

    Examples::

        from sctriangulate.preprocessing import reformat_peak
        adata = reformat_peak(adata,canonical_chr_only=True)

    '''
    var_names = adata.var_names
    valid = set(['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16',
             'chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY'])
    col = []
    for item in var_names:
        chr_ = item.split(':')[0]
        if chr_ in valid:
            start = item.split(':')[1].split('-')[0]
            end = item.split(':')[1].split('-')[1]
            now = '_'.join([chr_,start,end])
            col.append(now)
        else:
            col.append('non_canonical_chr')
    adata.var_names = col
    if canonical_chr_only:
        adata = adata[:,adata.var_names!='non_canonical_chr'].copy()
    return adata


def plot_coexpression(adata,gene1,gene2,kind,hist2d_bins=50,hist2d_cmap=bg_greyed_cmap('viridis'),hist2d_vmin=1e-5,hist2d_vmax=None,
                      scatter_dot_color='blue',contour_cmap='viridis',contour_levels=None,contour_scatter=True,contour_scatter_dot_size=5,
                      contour_train_kde='valid',surface3d_cmap='coolwarm',save=True,outdir='.',name=None):
    x = np.squeeze(make_sure_mat_dense(adata[:,gene1].X))
    y = np.squeeze(make_sure_mat_dense(adata[:,gene2].X))
    if kind == 'scatter':
        fig,ax = plt.subplots()
        ax.scatter(x,y,color=scatter_dot_color)
        ax.set_xlabel('{}'.format(gene1))
        ax.set_ylabel('{}'.format(gene2))
    elif kind == 'hist2d':
        fig,ax = plt.subplots()
        hist2d = ax.hist2d(x,y,bins=hist2d_bins,cmap=hist2d_cmap,vmin=hist2d_vmin,vmax=hist2d_vmax)
        fig.colorbar(mappable=hist2d[3],ax=ax)
        ax.set_xlabel('{}'.format(gene1))
        ax.set_ylabel('{}'.format(gene2))
    elif kind == 'contour':
        from scipy.stats import gaussian_kde
        fig,ax = plt.subplots()
        X,Y = np.meshgrid(np.linspace(x.min(),x.max(),100),np.linspace(y.min(),y.max(),100))
        positions = np.vstack([X.ravel(),Y.ravel()])  # (2, 10000)
        values = np.vstack([x,y])   # (2, 2700)
        if contour_train_kde == 'valid':  # data points that are non-zero for both gene1 and gen2
            values_to_kde = values[:,np.logical_not(np.any(values==0,axis=0))]
        elif contour_train_kde == 'semi_valid':  # data points that are non-zero for at least one of the gene
            values_to_kde = values[:,np.logical_not(np.all(values==0,axis=0))]
        elif contour_train_kde == 'full':   # all data points will be used for kde estimation
            values_to_kde == values
        kernel = gaussian_kde(values_to_kde)  
        density = kernel(positions) # (10000,)
        density = density.reshape(X.shape)  # (100,100)
        cset = ax.contour(X,Y,density,levels=contour_levels,cmap=contour_cmap)
        if contour_scatter:
            dot_density = kernel(values)
            dot_density_color = [cm.viridis(round(np.interp(x=item,xp=[dot_density.min(),dot_density.max()],fp=[0,255]))) for item in dot_density]
            ax.scatter(x,y,c=dot_density_color,s=contour_scatter_dot_size)
        from matplotlib.colors import Normalize
        fig.colorbar(mappable=cm.ScalarMappable(norm=Normalize(),cmap=contour_cmap),ax=ax)
        ax.set_xlabel('{}'.format(gene1))
        ax.set_ylabel('{}'.format(gene2))

    elif kind == 'contourf':
        from scipy.stats import gaussian_kde
        fig,ax = plt.subplots()
        X,Y = np.meshgrid(np.linspace(x.min(),x.max(),100),np.linspace(y.min(),y.max(),100))
        positions = np.vstack([X.ravel(),Y.ravel()])  # (2, 10000)
        values = np.vstack([x,y])   # (2, 2700)
        if contour_train_kde == 'valid':  # data points that are non-zero for both gene1 and gen2
            values_to_kde = values[:,np.logical_not(np.any(values==0,axis=0))]
        elif contour_train_kde == 'semi_valid':  # data points that are non-zero for at least one of the gene
            values_to_kde = values[:,np.logical_not(np.all(values==0,axis=0))]
        elif contour_train_kde == 'full':   # all data points will be used for kde estimation
            values_to_kde == values
        kernel = gaussian_kde(values_to_kde)  
        density = kernel(positions) # (10000,)
        density = density.reshape(X.shape)  # (100,100)
        cfset = ax.contourf(X,Y,density,levels=contour_levels,cmap=contour_cmap) 
        cset = ax.contour(X,Y,density,levels=contour_levels,colors='k')  
        clable = ax.clabel(cset,inline=True,fontsize=5)
        from matplotlib.colors import Normalize
        fig.colorbar(mappable=cm.ScalarMappable(norm=Normalize(),cmap=contour_cmap),ax=ax)
        ax.set_xlabel('{}'.format(gene1))
        ax.set_ylabel('{}'.format(gene2))

    elif kind == 'surface3d':
        fig = plt.figure()
        from scipy.stats import gaussian_kde
        X,Y = np.meshgrid(np.linspace(x.min(),x.max(),100),np.linspace(y.min(),y.max(),100))
        positions = np.vstack([X.ravel(),Y.ravel()])  # (2, 10000)
        values = np.vstack([x,y])   # (2, 2700)
        if contour_train_kde == 'valid':  # data points that are non-zero for both gene1 and gen2
            values_to_kde = values[:,np.logical_not(np.any(values==0,axis=0))]
        elif contour_train_kde == 'semi_valid':  # data points that are non-zero for at least one of the gene
            values_to_kde = values[:,np.logical_not(np.all(values==0,axis=0))]
        elif contour_train_kde == 'full':   # all data points will be used for kde estimation
            values_to_kde == values
        kernel = gaussian_kde(values_to_kde)  
        density = kernel(positions) # (10000,)
        density = density.reshape(X.shape)  # (100,100)
        ax = plt.axes(projection='3d')
        surf = ax.plot_surface(X,Y,density,cmap=surface3d_cmap)
        ax.set_xlabel('{}'.format(gene1))
        ax.set_ylabel('{}'.format(gene2))  
        ax.set_zlabel('PDF for KDE')      
        fig.colorbar(mappable=surf,ax=ax)
    if save:
        if name is None:
            plt.savefig(os.path.join(outdir,'coexpression_{}_{}_{}_plot.pdf'.format(kind,gene1,gene2)),bbox_inches='tight')
            plt.close()
        else:
            plt.savefig(os.path.join(outdir,name),bbox_inches='tight')
            plt.close()            

    return ax


[docs]def umap_color_exceed_102(adata,key,dot_size=None,legend_fontsize=6,outdir='.',name=None,**kwargs):
    '''
    draw a umap that bypass the scanpy 102 color upper bound, this can generate as many as 433 clusters.
    
    :param adata: Anndata
    :param key: the categorical column in adata.obs, which will be plotted
    :param dot_size: None or number
    :param legend_fontsize: defualt is 6
    :param outdir: output directory, default is '.'
    :param name: name of the plot, default is None

    Exmaple::

        from sctriangulate.preprocessing import umap_color_exceed_102
        umap_color_exceed_102(adata,key='leiden6')  # more than 130 clusters

    .. image:: ./_static/more_than102.png
        :height: 550px
        :width: 550px
        :align: center
        :target: target       
    '''
    fig,ax = plt.subplots()
    mapping = colors_for_set(adata.obs[key].unique().tolist())
    color = adata.obs[key].map(mapping).values
    if dot_size is None:
        dot_size = 120000/adata.shape[0]
    ax.scatter(adata.obsm['X_umap'][:,0],adata.obsm['X_umap'][:,1],c=color,s=dot_size,**kwargs)
    import matplotlib.lines as mlines
    ax.legend(handles=[mlines.Line2D([],[],marker='o',linestyle='',color=i) for i in mapping.values()],
            labels=[i for i in mapping.keys()],loc='upper left',bbox_to_anchor=(1,1),ncol=3,frameon=False,prop={'size':6})
    if name is None:
        name = 'umap_{}_exceed_102.pdf'.format(key)
    plt.savefig(os.path.join(outdir,name),bbox_inches='tight')
    plt.close()


def sankey_like_plot(df_raw, col_to_plot,figsize=(6.4,4.8),gradient=False,cmap=None,custom_color=None):
    from matplotlib.patches import Rectangle,Patch,PathPatch
    from matplotlib.path import Path
    alpha = 0.6

    df = df_raw.loc[:,col_to_plot]
    if custom_color is not None:
        color_dict = custom_color
    else:
        color_dict = colors_for_set(df.index.tolist(),gradient=gradient,cmap=cmap)
    total = df.sum(axis=0).values
    fig,ax = plt.subplots(figsize=figsize)
    ax.set_xlim([0,1])
    ax.set_ylim([-0.2,1.2])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    margin = 0.1
    n = len(col_to_plot)
    ys = [1] * n
    dist = (1 - margin * 2)/ (n + n -1)
    for row in df.itertuples():
        cluster = row.Index
        color = color_dict[cluster]

        for i,col in enumerate(col_to_plot):
            tp = eval('row.{}'.format(col)) / total[i]
            # draw block
            xy_x = 0.1 + i * 2 * dist
            xy_y = ys[i] - tp
            width = dist
            height = tp
            rec = Rectangle((xy_x,xy_y),width,height,edgecolor='k',facecolor=color,lw=0.5)
            ax.add_patch(rec)
            # draw line
            if i < n-1:
                start_up = (xy_x+dist,ys[i])
                end_up = (xy_x+dist+dist,ys[i+1])
                control_start_up = (xy_x+dist+dist/2,ys[i])
                control_end_up = (xy_x+dist+dist/2,ys[i+1])
                start_down = (xy_x+dist,ys[i]-tp)
                end_down = (xy_x+dist+dist,ys[i+1]-(eval('row.{}'.format(col_to_plot[i+1])) / total[i+1]))
                control_start_down = (xy_x+dist+dist/2,ys[i]-tp)
                control_end_down = (xy_x+dist+dist/2,ys[i+1]-(eval('row.{}'.format(col_to_plot[i+1])) / total[i+1]))
                p = Path([start_up,control_start_up,control_end_up,end_up,end_down,control_end_down,control_start_down,start_down,start_up],
                        [Path.MOVETO,Path.CURVE4,Path.CURVE4,Path.CURVE4,Path.LINETO,Path.CURVE4,Path.CURVE4,Path.CURVE4,Path.CLOSEPOLY])  
                pp = PathPatch(p,fc=color,alpha=alpha,lw=0.5)      
                ax.add_patch(pp)
            ys[i] = ys[i] - tp

    ax.legend(handles=[Patch(color=v) for v in color_dict.values()],labels=[k for k in color_dict.keys()],loc='upper left',bbox_to_anchor=(1,1),ncol=2,frameon=False)
    ax.set_xticks([0.1 + dist/2 + i * 2 * dist for i in range(n)])
    ax.set_xticklabels(col_to_plot)
    ax.set_yticks([])
    plt.savefig('sankey_like_plot_multiple_cols_{}.pdf'.format('_'.join(col_to_plot)),bbox_inches='tight')
    plt.close()

def multiple_column_sankey(adata,annotations,opacity=0.6,pad=3,thickness=10,margin=300,text=True,save=True,as_html=True,outdir='.'):
    import plotly.graph_objects as go
    all_node_label = []
    all_node_color = []
    all_link_source = []
    all_link_target = []
    all_link_value = []
    all_link_pert = []
    all_link_color = []

    for i, anno in enumerate(annotations):
        if i < len(annotations) - 1:
            left_annotation = anno
            right_annotation = annotations[i+1]
            df = adata.obs.loc[:,[left_annotation,right_annotation]]
            node_label = df[left_annotation].unique().tolist() + df[right_annotation].unique().tolist()
            node_color = pick_n_colors(len(node_label))
            all_node_label.extend(node_label)
            all_node_color.extend(node_color)

            link = []
            for source,sub in df.groupby(by=left_annotation):
                for target,subsub in sub.groupby(by=right_annotation):
                    if subsub.shape[0] > 0:
                        link.append((source,target,subsub.shape[0],subsub.shape[0]/sub.shape[0]))
            link_info = list(zip(*link))
            link_source = [all_node_label.index(item) for item in link_info[0]]
            link_target = [all_node_label.index(item) for item in link_info[1]]
            link_value = link_info[2]
            link_pert = [round(item,2) for item in link_info[3]]
            link_color = ['rgba{}'.format(tuple([infer_to_256(item) for item in to_rgb(all_node_color[i])] + [opacity])) for i in link_source]

            all_link_source.extend(link_source)
            all_link_target.extend(link_target)
            all_link_value.extend(link_value)
            all_link_pert.extend(link_pert)
            all_link_color.extend(link_color)


    node_plotly = dict(pad = pad, thickness = thickness,line = dict(color = "grey", width = 0.1),label = all_node_label,color = all_node_color)
    link_plotly = dict(source=all_link_source,target=all_link_target,value=all_link_value,color=all_link_color,customdata=all_link_pert,
                       hovertemplate='%{source.label} -> %{target.label} <br /> number of cells: %{value} <br /> percentage: %{customdata}')
    if not text:
        fig = go.Figure(data=[go.Sankey(node = node_plotly,link = link_plotly, textfont=dict(color='rgba(0,0,0,0)',size=1))])
    else:
        fig = go.Figure(data=[go.Sankey(node = node_plotly,link = link_plotly)])
    fig.update_layout(title_text='sankey_multiple', font_size=6, margin=dict(l=margin,r=margin))
    if save:
        if not as_html:
            fig.write_image(os.path.join(outdir,'multiple_column_sankey.pdf'))
        else:
            fig.write_html(os.path.join(outdir,'multiple_column_sankey.html'),include_plotlyjs='cdn') 


def custom_two_column_sankey(adata,left_annotation,right_annotation,opacity=0.6,pad=3,thickness=10,margin=300,text=True,save=True,as_html=True,outdir='.'):
    import plotly.graph_objects as go
    import kaleido
    df = adata.obs.loc[:,[left_annotation,right_annotation]]
    node_label = df[left_annotation].unique().tolist() + df[right_annotation].unique().tolist()
    node_color = pick_n_colors(len(node_label))
    link = []
    for source,sub in df.groupby(by=left_annotation):
        for target,subsub in sub.groupby(by=right_annotation):
            if subsub.shape[0] > 0:
                link.append((source,target,subsub.shape[0],subsub.shape[0]/sub.shape[0]))
    link_info = list(zip(*link))
    link_source = [node_label.index(item) for item in link_info[0]]
    link_target = [node_label.index(item) for item in link_info[1]]
    link_value = link_info[2]
    link_pert = [round(item,2) for item in link_info[3]]
    
    link_color = ['rgba{}'.format(tuple([infer_to_256(item) for item in to_rgb(node_color[i])] + [opacity])) for i in link_source]
    node_plotly = dict(pad = pad, thickness = thickness,line = dict(color = "grey", width = 0.1),label = node_label,color = node_color)
    link_plotly = dict(source=link_source,target=link_target,value=link_value,color=link_color,customdata=link_pert,
                       hovertemplate='%{source.label} -> %{target.label} <br /> number of cells: %{value} <br /> percentage: %{customdata}')
    if not text:
        fig = go.Figure(data=[go.Sankey(node = node_plotly,link = link_plotly, textfont=dict(color='rgba(0,0,0,0)',size=1))])
    else:
        fig = go.Figure(data=[go.Sankey(node = node_plotly,link = link_plotly)])
    fig.update_layout(title_text='sankey_{}_{}'.format(left_annotation,right_annotation), font_size=6, margin=dict(l=margin,r=margin))
    if save:
        if not as_html:
            fig.write_image(os.path.join(outdir,'two_column_sankey_{}_{}_text_{}.pdf'.format(left_annotation,right_annotation,text))) 
        else:
            fig.write_html(os.path.join(outdir,'two_column_sankey_{}_{}_text_{}.html'.format(left_annotation,right_annotation,text)),include_plotlyjs='cdn') 


[docs]def rna_umap_transform(outdir,ref_exp,ref_group,q_exp_list,q_group_list,q_identifier_list,pca_n_components,umap_min_dist=0.5):
    '''
    Take a reference expression matrix (pandas dataframe), and a list of query expression matrics (pandas dataframe),
    along with a list of query expression identifiers. This function will generate the umap transformation of your reference exp,
    and make sure to squeeze your each query exp to the same umap transformation.

    :param outdir: the path in which all the results will go
    :param ref_exp: the pandas dataframe object,index is the features, column is the cell barcodes
    :param ref_group: the pandas series object, index is the cell barcodes, the value is the clusterlabel information
    :param q_exp_list: the list of pandas dataframe object, requirement is the same as ref_exp
    :param q_group_list: the list of pandas series object, requirement is the same as ref_group
    :param q_identifier_list: the list of string, to denote the name of each query dataset
    :param pca_n_components: the int, denote the PCs to use for PCA
    :param umap_min_dist: the float number, denote the min_dist parameter for umap program

    Examples::

        rna_umap_transform(outdir='.',ref_exp=ref_df,ref_group=ref_group_df,q_exp_list=[q1_df],q_group_list=[q1_group_df],q_identifier_list=['q1'],pca_n_components=50)
    '''
    # create outdir if not exist
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    ref_variable = ref_exp.index.values
    store_df = []
    store_variable = []
    for q_exp in q_exp_list:
        q_variable = q_exp.index.values
        store_df.append(q_exp)
        store_variable.append(q_variable)

    # find common features between ref and all qs.
    all_variable = copy.deepcopy(store_variable)
    all_variable.insert(0,ref_variable)
    common_variable = list(reduce(lambda a,b: set(a).intersection(set(b)),all_variable))

    # subset the exps, also transpose
    ref_exp_common = ref_exp.loc[common_variable,:].T
    store_df_common = []
    for q_exp in store_df:
        q_exp_common = q_exp.loc[common_variable,:].T
        store_df_common.append(q_exp_common)


    # train pca and umap on ref
    ref_pca_model = PCA(n_components = pca_n_components).fit(ref_exp_common.values)
    ref_pca_score = ref_pca_model.transform(ref_exp_common.values)
    ref_umap_model = umap.UMAP(min_dist=umap_min_dist).fit(ref_pca_score)
    ref_umap_embed = ref_umap_model.embedding_

    # transform all the querys
    store_q_umap = []
    for q_exp_common in store_df_common:
        q_pca_score = ref_pca_model.transform(q_exp_common.values)
        q_umap_embed = ref_umap_model.transform(q_pca_score)
        store_q_umap.append(q_umap_embed)


    ref_umap_embed_df = pd.DataFrame(data=ref_umap_embed,index=ref_exp_common.index,columns=['umap_x','umap_y'])
    ref_umap_embed_df.to_csv(os.path.join(outdir,'ref_umap.txt'),sep='\t')
    for i,item in enumerate(store_q_umap):
        q_exp_common = store_df_common[i]
        item = pd.DataFrame(data=item,index=q_exp_common.index,columns=['umap_x','umap_y'])
        item.to_csv(os.path.join(outdir,'query_{}_umap.txt'.format(q_identifier_list[i])),sep='\t')


    # visualization
    all_identifier = ['ref'] + q_identifier_list
    all_exp = [ref_exp_common] + store_df_common
    all_label_mapping = [group_df.to_dict() for group_df in [ref_group] + q_group_list]
    all_umap = [ref_umap_embed] + store_q_umap
    for i,exp,label_map,embed in zip(all_identifier,all_exp,all_label_mapping,all_umap):
        adata = ad.AnnData(X=exp.values,obs=pd.DataFrame(index=exp.index),var=pd.DataFrame(index=exp.columns))
        adata.obs['label'] = adata.obs_names.map(label_map).fillna('unknown').values
        adata.obsm['X_umap'] = embed
        sc.pl.umap(adata,color='label',legend_loc='on data',legend_fontsize=6)
        plt.savefig(os.path.join(outdir,'umap_{}.pdf'.format(i)),bbox_inches='tight')
        plt.close()