染色体位置如何注释到基因名

1. 环境准备

python
gff3文件：https://www.gencodegenes.org/human/release_19.html
python包：

pip3 install pandas, pysam, numpy

2. 输入文件

#Chr    Begin   End Size(Mb)    Nb_variants Percentage_homozygosity
chr1    37356469    39797055    2.44    57  89.47
chr1    89731959    91742055    2.01    31  90.32
chr1    92327126    94544234    2.22    59  91.53
chr1    172100831   176102969   4   78  91.03
chr2    11718591    15732002    4.01    26  100
chr2    39262068    41384390    2.12    26  92.31
chr2    59614572    62047433    2.43    33  93.94
chr2    80809027    85143406    4.33    27  92.59
chr2    143787161   159672168   15.89   163 93.87

3. 输出文件

样式1

chr start   end Size(Mb)    Nb_variants Percentage_homozygosity genes
chr1    37356469    39797055    2.44    57  89.47   GRIK3(5.87);ZC3H12A(0.4);MEAF6(0.91);SNIP1(0.81);DNALI1(0.41);GNL2(1.19);RSPO1(0.97);C1orf109(0.44);CDCA8(0.71);EPHA10(2.1);MANEAL(0.3);YRDC(0.21);C1orf122(0.1);MTF1(2.05);INPP5B(3.54);SF3A3(1.39);FHL3(0.36);UTP11L(0.64);POU3F1(0.12);RRAGC(0.89);MYCBP(0.76);GJA9(0.7);RHBDL2(2.29);AKIRIN1(0.61);NDUFS5(0.34);MACF1(10.25)
chr1    89731959    91742055    2.01    31  90.32   GBP5(0.33);GBP6(1.11);LRRC8B(3.63);LRRC8C(6.81);LRRC8D(5.75);ZNF326(2.01);BARHL2(0.28);ZNF644(5.32);HFM1(0.78)
chr1    92327126    94544234    2.22    59  91.53   TGFBR3(2.02);BRDT(2.93);EPHX4(1.51);BTBD8(3.05);KIAA1107(0.8);C1orf146(1.26);GLMN(2.37);RPAP2(4.65);GFI1(0.55);EVI5(12.8);RPL5(0.45);FAM69A(5.38);MTF2(2.7);TMED5(1.4);CCDC18(4.46);DR1(1.06);FNBP1L(4.8);BCAR3(12.87);DNTTIP2(0.55);GCLM(1.09);ABCA4(3.87)
chr1    172100831   176102969   4   78  91.03   DNM3(7.17);PIGC(1.85);C1orf105(1.2);SUCO(1.99);FASLG(0.2);TNFSF18(0.27);TNFSF4(0.59);PRDX6(0.29);SLC9C2(2.56);ANKRD45(1.51);KLHL20(1.79);CENPL(0.63);DARS2(0.85);ZBTB37(0.89);SERPINC1(0.34);RC3H1(2.28);RABGAP1L(20.89);GPR52(0.04);CACYBP(0.31);MRPS14(0.32);TNN(2.0);KIAA0040(0.9);TNR(10.71);RFWD2(4.72)
chr2    11718591    15732002    4.01    26  100 GREB1(1.6);NTSR2(0.3);LPIN1(3.73);TRIB2(0.64);FAM84A(0.45);AC011897.1(0.39);NBAS(9.83);DDX1(0.02)
chr2    39262068    41384390    2.12    26  92.31   SOS1(4.21);CDKL4(2.54);MAP4K3(8.86);TMEM178A(2.5);THUMPD2(2.04);SLC8A1(24.21)
chr2    59614572    62047433    2.43    33  93.94   BCL11A(4.21);PAPOLG(1.88);REL(2.06);PUS10(3.21);PEX13(1.43);KIAA1841(4.07);C2orf74(0.81);AHSA2(0.57);USP34(11.64);XPO1(2.5)
chr2    80809027    85143406    4.33    27  92.59   CTNNA2(1.54);SUCLG1(0.84);DNAH6(6.99);TRABD2A(1.97);TMSB10(0.02)
chr2    143787161   159672168   15.89   163 93.87   KYNU(0.08);ARHGAP15(4.26);GTDC1(2.48);ZEB2(0.88);ACVR2A(0.54);ORC4(0.57);MBD5(3.13);EPC2(0.9);KIF5C(1.58);LYPD6B(1.12);LYPD6(0.91);MMADHC(0.11);RND3(0.45);RBM43(0.09);NMI(0.12);TNFAIP6(0.14);RIF1(0.62);NEB(1.57);ARL5A(0.25);CACNB4(1.68);STAM2(0.37);FMNL2(1.98);PRPF40A(0.42);ARL6IP6(0.27);RPRM(0.01);GALNT13(3.66);KCNJ3(1.01);NR4A2(0.11);GPD2(1.12);GALNT5(0.36);ERMN(0.06);CYTIP(0.47);ACVR1C(0.64);ACVR1(0.88);UPP2(1.63);CCDC148(1.8);PKP4(1.42);DAPL1(0.13)

输出样式2

chr start   end Size(Mb)    Nb_variants Percentage_homozygosity gene_name   gene_coverage
chr1    37356469    39797055    2.44    57  89.47   GRIK3   5.87
chr1    37356469    39797055    2.44    57  89.47   ZC3H12A 0.4
chr1    37356469    39797055    2.44    57  89.47   MEAF6   0.91
chr1    37356469    39797055    2.44    57  89.47   SNIP1   0.81
chr1    37356469    39797055    2.44    57  89.47   DNALI1  0.41
chr1    37356469    39797055    2.44    57  89.47   GNL2    1.19
chr1    37356469    39797055    2.44    57  89.47   RSPO1   0.97
chr1    37356469    39797055    2.44    57  89.47   C1orf109    0.44
chr1    37356469    39797055    2.44    57  89.47   CDCA8   0.71
chr1    37356469    39797055    2.44    57  89.47   EPHA10  2.1
chr1    37356469    39797055    2.44    57  89.47   MANEAL  0.3
chr1    37356469    39797055    2.44    57  89.47   YRDC    0.21

3. 提取gff3文件的python代码

# 导入包
import pandas as pd
import numpy as np
import pysam
# 读取gff3文件
gencode = pd.read_table("/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3", comment="#",
                        sep = "\t", names = ['seqname', 'source', 'feature', 'start' , 'end', 'score', 'strand', 'frame', 'attribute'])
gencode.head()

提取gff3文件中所需要的信息

gencode_genes = gencode[(gencode.feature == "gene")][['seqname', 'start', 'end', 'attribute']].copy().reset_index().drop('index', axis=1) # Extract genes
gencode_genes.head()

提取基因名，基因类型，基因状态，基因水平（1verified loci，2manually annotated loci,3automatically annotated loci）

def gene_info(x):
    # Extract gene names
    g_name = list(filter(lambda x: 'gene_name' in x,  x.split(";")))[0].split("=")[1]
    g_type = list(filter(lambda x: 'gene_type' in x,  x.split(";")))[0].split("=")[1]
    g_status = list(filter(lambda x: 'gene_status' in x,  x.split(";")))[0].split("=")[1]
    g_leve = int(list(filter(lambda x: 'level' in x,  x.split(";")))[0].split("=")[1])
    return (g_name, g_type, g_status, g_leve)

gencode_genes["gene_name"], gencode_genes["gene_type"], gencode_genes["gene_status"], gencode_genes["gene_level"] = zip(*gencode_genes.attribute.apply(lambda x: gene_info(x)))
gencode_genes.head()

查看基因类型分布

gencode_genes['gene_type'].drop_duplicates()

提取所有已知的蛋白编码基因

gencode_genes = gencode_genes[gencode_genes['gene_status'] == 'KNOWN'].reset_index().drop('index', axis=1)
gencode_genes = gencode_genes[gencode_genes['gene_type'] == 'protein_coding'].reset_index().drop('index', axis=1)
gencode_genes.info()

# 移除重复的记录并保存
## Sort gene_leve in each chromosome (ascending oder) and remove duplicates
gencode_genes = gencode_genes.sort_values(['gene_level', 'seqname'], ascending=True).drop_duplicates('gene_name', keep='first').reset_index().drop('index', axis=1) 
gencode_genes.to_csv('/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt', index=False, header = False, sep="\t")
gencode_genes.info()

压缩

%%bash -s /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt
cut -f 1,2,3,5 $1 | sortBed -i > /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed
bgzip /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed
tabix -p bed /mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz
ls -l

4. 注释染色体位置包含的基因名

定义重叠位置计算函数

def overlap(q_st, q_end, res_st, res_end):
    o  = min(q_end, res_end)-max(q_st, res_st)
    return o

定义输入与输入文件目录

import os
input_path = "/mnt/f/projects/gene/patient1/"
input_file_name = "patient1.HomRegions.tsv"
input_file = os.path.join(input_path,input_file_name)
# 读取输入文件
### Read the sample file in to a pandas dataframe
df = pd.read_table(input_file, 
                   names=['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity'], skiprows=1)
df.head()

计算重叠基因及重叠部位占染色体位置的百分比

def gencode_all_known_genes(a, tb):
    genes = []

    try:
        for region in tb.fetch(a['chr'], int(a['start']), int(a['end'])):
            if region:
                r = region.split('\t')
                overlap_len = overlap(int(a['start']), int(a['end']), int(r[1]), int(r[2]))
                ret_val = '{}({})'.format(r[3], np.round(overlap_len/float(int(a['end'])-int(a['start']))*100, 2)) ### Percentage of the input interval that overlap with the gene
                genes.append(ret_val) 

        if len(genes)>0:
            return ";".join(genes)
        else:
            return "NA(0)"
    except ValueError:
        return "NA(0)"

import pysam
gencode_v19 = pysam.TabixFile('/mnt/f/genome/Homo_sapiens_gff3/gencode.v19.annotation.gff3_all_known_genes.txt.sorted.formatted.bed.gz')

df['genes'] = df.apply(lambda x: gencode_all_known_genes(x[['chr', 'start', 'end']], gencode_v19), axis=1)
df.head()

输出样式1

df.to_csv(os.path.join(input_path,'out_df.tsv'), index=False, header = True, sep="\t")

转为一个基因一行的格式

# 移除无基因的染色体位置
## Remove all the intervals that do not overlap with genes
df2 = df[df['genes'] != "NA(0)"].reset_index(drop=True)
# 每个基因一行
new_rows = []
for i,r in df2.iterrows():
    g_list = r['genes'].split(";")
    for g in g_list:
        g = g.replace(" ","")
        new_rows.append(np.append(r[['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity', 'genes']].values, g))
df_perGene = pd.DataFrame()
df_perGene = df_perGene.append(pd.DataFrame(new_rows, columns=['chr', 'start', 'end', 'Size(Mb)', 'Nb_variants', 'Percentage_homozygosity', 'genes', 'gene_ID'])).reset_index().drop('index', axis=1)

df_perGene['gene_name'] = df_perGene['gene_ID'].apply(lambda x: x.split("(")[0])
df_perGene['gene_coverage'] = df_perGene['gene_ID'].apply(lambda x: x.split("(")[1].replace(")", ""))

## 移除之前的重复列 drop the genes column
df_perGene = df_perGene.drop(["genes", "gene_ID"], axis=1)
df_perGene.head()

## 输出
df_perGene.to_csv(os.path.join(input_path,'out_df_perGene.tsv'), index=False, header = True, sep="\t")

来源：https://gist.github.com/PubuduSaneth/6275f5df25868e0a8132d7e40c8d317f#file-01-format_gencode_gff3-ipynb