Source code for vipcca.tools.utils

import scipy
from scipy.sparse import csr_matrix, csc_matrix
from sklearn.preprocessing import QuantileTransformer
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.use('Agg')
sns.set(style='white', rc={'figure.figsize': (5, 5), 'figure.dpi': 150})


[docs]def read_sc_data(input_file, fmt='h5ad', backed=None, transpose=False, sparse=False, delimiter=" ", unique_name=True, batch_name=None, var_names="gene_symbols"): """\ Read single cell dataset Parameters ---------- input_file : string The path of the file to be read. fmt : string, optional (default: 'h5ad') The file type of the file to be read. backed : Union[Literal[‘r’, ‘r+’], bool, None] (default: None) If 'r', load AnnData in backed mode instead of fully loading it into memory (memory mode). If you want to modify backed attributes of the AnnData object, you need to choose 'r+'. transpose: bool, optional (default: False) Whether to transpose the read data. sparse: bool, optional (default: False) Whether the data in the dataset is stored in sparse matrix format. delimiter: str, optional (default: ' ') Delimiter that separates data within text file. If None, will split at arbitrary number of white spaces, which is different from enforcing splitting at single white space ' '. unique_name: bool, optional (default: False) If Ture, AnnData object execute var_names_make_unique() and obs_names_make_unique() functions. batch_name: string, optional (default: None) Batch name of current batch data var_names: Literal[‘gene_symbols’, ‘gene_ids’] (default: 'gene_symbols') The variables index when the file type is 'mtx'. Returns ------- :class:`~anndata.AnnData` adata """ if fmt == '10x_h5': adata = sc.read_10x_h5(input_file) elif fmt == '10x_mtx': adata = sc.read_10x_mtx(input_file, var_names=var_names) elif fmt == "mtx": adata = sc.read_mtx(input_file) elif fmt == 'h5ad': adata = sc.read_h5ad(input_file, backed=backed) elif fmt == "csv": adata = sc.read_csv(input_file) elif fmt == "txt": adata = sc.read_text(input_file, delimiter=delimiter) elif fmt == "tsv": adata = sc.read_text(input_file, delimiter="\t") else: raise ValueError('`format` needs to be \'10x_h5\' or \'10x_mtx\'') if transpose: adata = adata.transpose() if sparse: adata.X = csr_matrix(adata.X, dtype='float32') if unique_name: adata.var_names_make_unique() adata.obs_names_make_unique() if batch_name is not None: adata.obs["_batch"]=batch_name return adata
def concatenate_datasets(datasets): """/ concatenate single cell datasets Parameters ---------- datasets: list List of AnnData objects to be concatenate. Returns ------- :class:`~anndata.AnnData` adata """ adata = datasets[0].concatenate(datasets[1]) for i in range(2, len(datasets)): adata = adata.concatenate(datasets[i]) return adata
[docs]def spatial_preprocessing(datasets, min_cells=1, min_genes=1, n_top_genes=2000, lognorm=True, hvg=True): """\ Preprocess and merge two visium datasets from different batches Parameters ---------- datasets: list, optional (default: None) The list of anndata objects from different batches min_cells: int, optional (default: 1) Minimum number of counts required for a cell to pass filtering. min_genes: int, optional (default: 1) Minimum number of counts required for a gene to pass filtering. n_top_genes: int, optional (default: 2000) Number of highly-variable genes to keep. lognorm: bool, optional (default: True) If True, execute lognorm() function. hvg: bool, optional (default: True) If True, choose hypervariable genes for AnnData object. Returns ------- :class:`~anndata.AnnData` adata """ for i in range(len(datasets)): sc.pp.normalize_total(datasets[i], inplace=True) sc.pp.log1p(datasets[i]) sc.pp.highly_variable_genes(datasets[i], flavor="seurat", n_top_genes=n_top_genes, inplace=True) datasets[i].obs['loss_weight'] = i adata_spatial = datasets[0].concatenate( datasets[1], batch_key="library_id", uns_merge="unique", batch_categories=[ k for d in [ datasets[0].uns["spatial"], datasets[1].uns["spatial"], ] for k, v in d.items() ], ) return adata_spatial
[docs]def spatial_rna_preprocessing(adata_spatial, adata_rna, lognorm=True, hvg=True, n_top_genes=2000): '''\ Preprocess and merge visium dataset with scRNA-seq dataset. Parameters ---------- adata_spatial: AnnData AnnData object of visium dataset. adata_rna: AnnData AnnData object of scRNA-seq dataset. lognorm: bool, optional (default: True) If True, execute lognorm() function. hvg: bool, optional (default: True) If True, choose hypervariable genes for AnnData object. n_top_genes: int, optional (default: 2000) Number of highly-variable genes to keep. Returns ------- :class:`~anndata.AnnData` adata ''' if lognorm: sc.pp.log1p(adata_spatial) sc.pp.log1p(adata_rna) if hvg: sc.pp.highly_variable_genes(adata_spatial, n_top_genes=8000) sc.pp.highly_variable_genes(adata_rna, n_top_genes=8000) adata_spatial = adata_spatial[:, adata_spatial.var.highly_variable] adata_rna = adata_rna[:, adata_rna.var.highly_variable] adata_spatial.obs['loss_weight'] = 0.0 adata_rna.obs['loss_weight'] = 1.0 adata = adata_spatial.concatenate(adata_rna) return adata
def generate_batch_code(batch, batch_num): batch_code = [] if batch_num == 2: batch_code = to_categorical(batch) if batch_num == 3: for i in range(batch.shape[0]): if batch[i] == 0: batch_code.append([1, 1, 0, 1]) elif batch[i] == 1: batch_code.append([0, 0, 1, 1]) else: batch_code.append([0, 1, 1, 0]) batch_num = batch_num + 1 else: batch_code = to_categorical(batch) batch_code = np.array(batch_code) return batch_code, batch_num def add_location(adata): adata_loc = AnnData(adata.obs[['xcoord','ycoord']]) adata = adata.T.concatenate(adata_loc.T, index_unique=None).T adata.obs.rename(columns={'xcoord-0':'xcoord','ycoord-0':'ycoord'},inplace=True) return adata def write2mtx(adata, path): if not os.path.exists(path): os.makedirs(path) gn=adata.var bc=adata.obs gn.to_csv(path+"genes.tsv",sep="\t",index=True,header=False) bc.to_csv(path+"annotation.tsv",sep="\t",index=True,header=False) bc.to_csv(path+"barcodes.tsv",index=True,header=False,columns=[]) scipy.io.mmwrite(path+"matrix.mtx", adata.X.transpose().astype(int)) def recipe_vipcca(adata, n_top_genes=2000, ncounts=1e6, min_cells=10, min_genes=10, max_genes=2500,mt_ratio=0.05,lognorm=True,loc=False,filter=True,hvg=True): if filter: sc.pp.filter_genes(adata,min_cells=min_cells) sc.pp.filter_cells(adata,min_genes=min_genes) mito_genes = adata.var_names.str.upper().str.startswith('MT-') adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1 adata = adata[adata.obs['n_genes'] < max_genes, :] adata = adata[adata.obs['percent_mito'] < mt_ratio, :] if lognorm: adata_norm = logNormalization(adata, loc=loc) sc.pp.highly_variable_genes(adata_norm, flavor='seurat', n_top_genes =n_top_genes , inplace=True) if loc: adata_norm.var.loc[['xcoord','ycoord'],'highly_variable']=[True,True] features = adata_norm.var_names[adata_norm.var['highly_variable']] adata_norm = adata_norm[:,features] sc.pp.filter_cells(adata_norm, min_genes=5) adata = adata[adata_norm.obs.index,:] adata = adata[:,features] adata_norm.raw = adata.copy() return adata_norm, adata
[docs]def preprocessing(datasets, min_cells=1, min_genes=1, n_top_genes=2000, mt_ratio=0.8, lognorm=True, hvg=True, index_unique=None): """\ Preprocess and merge data sets from different batches Parameters ---------- datasets: list, optional (default: None) the list of anndata objects from different batches min_cells: int, optional (default: 1) Minimum number of counts required for a cell to pass filtering. min_genes: int, optional (default: 1) Minimum number of counts required for a gene to pass filtering. n_top_genes: int, optional (default: 2000) Number of highly-variable genes to keep. mt_ratio: double, optional (default: 0.8) Maximum proportion of mito genes for a cell to pass filtering. lognorm: bool, optional (default: True) If True, execute lognorm() function. hvg: bool, optional (default: True) If True, choose hypervariable genes for AnnData object. index_unique: string, optional (default: None) Make the index unique by joining the existing index names with the batch category, using index_unique='-', for instance. Provide None to keep existing indices. Returns ------- :class:`~anndata.AnnData` adata_norm """ if lognorm: for i in range(len(datasets)): sc.pp.filter_genes(datasets[i],min_cells=min_cells) sc.pp.filter_cells(datasets[i],min_genes=min_genes) mito_genes = datasets[i].var_names.str.upper().str.startswith('MT-') datasets[i].obs['percent_mito'] = np.sum(datasets[i][:, mito_genes].X, axis=1).A1 / np.sum(datasets[i].X, axis=1).A1 datasets[i] = datasets[i][datasets[i].obs['percent_mito'] < mt_ratio, :] datasets[i]=logNormalization(datasets[i]) all_features=pd.Index([]) for i in range(len(datasets)): all_features=all_features.union(datasets[i].var_names) common_features=all_features for i in range(len(datasets)): common_features=common_features.intersection(datasets[i].var.index) df_var = pd.DataFrame(index=all_features) df_var['selected']=0 if hvg: for i in range(len(datasets)): sc.pp.highly_variable_genes(datasets[i], flavor='seurat', n_top_genes =n_top_genes , inplace=True) features = datasets[i].var_names[datasets[i].var['highly_variable']] df_var.loc[features]+=1 df_var=df_var.loc[common_features] df_var.sort_values(by="selected", ascending=False, inplace=True) selected_features=df_var.index[range(n_top_genes)] else: selected_features=common_features for i in range(len(datasets)): datasets[i] = datasets[i][:,selected_features] if i==0: adata=datasets[i] else: adata=adata.concatenate(datasets[i],index_unique =index_unique) return adata
def split_object(adata,by="batch"): adata.obs[by]=adata.obs[by].astype("category") index_cat = adata.obs[by].cat.categories datasets=[] for cat in index_cat: data=adata[adata.obs[by]==cat,:] datasets.append(data) return datasets def scale(adata,max_value=None,use_rep=None): if use_rep is None: sc.pp.scale(adata,max_value=max_value) else: maxv=adata.obsm[use_rep].max(0) minv=adata.obsm[use_rep].min(0) adata.obsm[use_rep]=(adata.obsm[use_rep]-minv)/(maxv-minv) def rank_normalization(adata,n_quantiles=2000): qt = QuantileTransformer(n_quantiles=n_quantiles,output_distribution='normal',ignore_implicit_zeros=False) adata.X = qt.fit_transform(adata.X.toarray()) def logNormalization(adata,loc=False): adata.obs['n_counts'] = adata.X.sum(axis=1).A1 adata.obs['size_factor']=adata.obs['n_counts']/1e6 adata.raw=adata.copy() sc.pp.normalize_total(adata,target_sum=1e6) if loc: adata_loc = add_location(adata) adata_loc.obs.rename(columns={'n_counts-0':'n_counts','size_factor-0':'size_factor'},inplace=True) sc.pp.log1p(adata_loc) return adata_loc else: sc.pp.log1p(adata) return adata def createCoordination(adata, result_path=None, plot=False, resolution=1): xmax=np.ceil(adata.obs.xcoord.max())+resolution ymax=np.ceil(adata.obs.ycoord.max())+resolution nn=(xmax/resolution).astype("uint16") mm=(ymax/resolution).astype("uint16") img_size=np.int(np.ceil(np.max([nn,mm])/216)*216) mX=np.zeros(shape=(img_size,img_size),dtype="uint16") xcoord_img = (adata.obs.xcoord.values/resolution).astype("uint16") ycoord_img = (adata.obs.ycoord.values/resolution).astype("uint16") mX[ycoord_img,xcoord_img]+=1 adata.obs["xcoord_img"]=xcoord_img adata.obs["ycoord_img"]=ycoord_img count = csr_matrix(([1]*len(xcoord_img),(xcoord_img,ycoord_img)),shape=(img_size,img_size),dtype="int") adata.obs["overlapped"]=count.toarray()[xcoord_img,ycoord_img] adata.X = csc_matrix((adata.X.T.todense()/adata.obs.overlapped.values).T) size_factor = np.zeros(shape=(img_size,img_size,1),dtype="float32") size_factor[xcoord_img,ycoord_img,0] = adata.obs["size_factor"] if plot: plt.scatter(xcoord_img,ycoord_img,s=1,c='r') plt.title("All cells in one image") plt.savefig(result_path+"image_cells.png") plt.close() return img_size, img_size, size_factor def generate_img_from_genes(adata,img_size=648,batch_size=32, cols=[6,7]): while True: for i in range(adata.shape[1]): data=adata.X.getcol(i).data ind=adata.X.getcol(i).indices xx = adata.obs.iloc[ind,cols[0]].values yy = adata.obs.iloc[ind,cols[1]].values img = csr_matrix((data,(yy,xx)),shape=(img_size,img_size),dtype="f4").toarray().reshape(1,img_size,img_size,1) yield (img,img) def generate_datasets(adata,img_size=648, cols=[6,7], count_only=False): n_genes=adata.shape[1] x=np.zeros(shape=(n_genes,img_size,img_size,1),dtype="float32") x_label=np.zeros_like(x) for i in range(n_genes): fi=i data=adata.X.getcol(fi).data ind=adata.X.getcol(fi).indices xx = adata.obs.ix[ind, 'xcoord_img'].values yy = adata.obs.ix[ind, 'ycoord_img'].values ## accumalate cells in the same location x[i,]=csr_matrix((data,(xx,yy)),shape=(img_size,img_size),dtype="float32").toarray().reshape(img_size,img_size,1) if not count_only: rawdata=adata.raw.X.getcol(fi).data x_label[i,] = csr_matrix((rawdata,(xx,yy)),shape=(img_size,img_size),dtype="float32").toarray().reshape(img_size,img_size,1) else: x_label=x return x, x_label ## generate data used for fit_gen() # x_train, x_train_label, x_val, x_val_label = pp.generate_datasets(adata,img_size=img_size,validation_split=validation_split) # datagen = ImageDataGenerator() # generator_train=datagen.flow(x_train, x_train_label, batch_size=batch_size) # generator_val=datagen.flow(x_val, x_val_label, batch_size=batch_size) # unet.fit_gen(generator_train, validation_data=generator_val) def generate_datasets_ext(adata,img_size=648,validation_split=0.2): n_genes=adata.shape[1] samples_for_train=n_genes-np.int(n_genes*validation_split) id_list=np.random.permutation(n_genes) x_train=np.zeros((samples_for_train,img_size,img_size,1),dtype="f4") x_train_label=np.zeros_like(x_train) x_val=np.zeros((n_genes-samples_for_train,img_size,img_size,1),dtype="f4") x_val_label=np.zeros_like(x_val) p=0 for i in id_list[range(samples_for_train)]: data=adata.X.getcol(i).data rawdata=adata.raw.X.getcol(i).data ind=adata.X.getcol(i).indices xx = adata.obs.iloc[ind,6].values yy = adata.obs.iloc[ind,7].values x_train[p,] = csr_matrix((data,(yy,xx)),shape=(img_size,img_size),dtype="f4").toarray().reshape(1,img_size,img_size,1) x_train_label[p,] = csr_matrix((rawdata,(yy,xx)),shape=(img_size,img_size),dtype="f4").toarray().reshape(1,img_size,img_size,1) p+=1 p=0 for i in id_list[np.arange(samples_for_train,n_genes)]: data=adata.X.getcol(i).data rawdata=adata.raw.X.getcol(i).data ind=adata.X.getcol(i).indices xx = adata.obs.iloc[ind,6].values yy = adata.obs.iloc[ind,7].values x_val[p,] = csr_matrix((data,(yy,xx)),shape=(img_size,img_size),dtype="f4").toarray().reshape(1,img_size,img_size,1) x_val_label[p,] = csr_matrix((rawdata,(yy,xx)),shape=(img_size,img_size),dtype="f4").toarray().reshape(1,img_size,img_size,1) p+=1 return x_train, x_train_label, x_val, x_val_label