基因数据处理与可视化

基因数据处理与可视化

流程步骤:

第一步:数据预处理

第二步:数据分析

第三步:数据可视化

第一步:数据预处理

import re  #正则表达库
import os  #文件目录操作
import time 
import datetime #获取系统时间
import sys      #获取命令行参数

1.保存日志写入文件

def save_log(data,log_file_name='insects_log',path = '/home/zhangkaifu/log/'):
    """Append a message to a plain-text log file.

    Args:
        data: Content to log. A ``str`` is written as-is, a ``list`` is
            written one item per line, and any other object is written
            via ``str()``.
        log_file_name: Log file name without the ``.txt`` extension.
        path: Directory that holds the log file; created when missing.
            A trailing slash is optional.
    """
    if path:
        os.makedirs(path, exist_ok=True)
    # os.path.join keeps the result correct whether or not `path` ends
    # with a separator.
    log_path = os.path.join(path, log_file_name + '.txt')
    # The context manager guarantees the handle is flushed and closed.
    with open(log_path, 'a+') as out_log:
        out_log.write('\n')
        if isinstance(data, str):
            out_log.write(data)
        elif isinstance(data, list):
            for line in data:
                out_log.write('\n' + str(line))
        else:
            out_log.write(str(data))

2.对信息数据文件(.gff)进行筛选处理,每个基因只保留长度最长的一条RNA记录

# gff_file = r'biodata/GCF_003254395.2_Amel_HAv3.1_genomic.gff'
def _gff_name_attribute(attributes):
    """Return the value of the ``Name`` key in a GFF attribute column, or None."""
    for item in str(attributes).split(";"):
        parts = item.split("=")
        # A bare "Name" without "=value" carries no usable name; skip it.
        if parts[0].lower() == 'name' and len(parts) > 1:
            return parts[1]
    return None


def processing_gff(gff_file):
    """Collect the longest named RNA record that follows each gene/region line.

    Lines with fewer than 8 tab-separated columns are ignored. A ``gene`` or
    ``region`` line starts a new group. Within a group, the first line whose
    type contains "RNA" (case-insensitive) and has a ``Name`` attribute opens
    a record; a later RNA line replaces it only when its span
    (column 5 minus column 4) is strictly larger and it also has a ``Name``.

    Args:
        gff_file: Path of the GFF file to read.

    Returns:
        A list of ``[start, end, name]`` records; all three items are the
        strings read from the file.
    """
    data = []
    has_record = False  # True once the current group has a record in `data`
    best = []           # the record currently stored for this group
    with open(gff_file) as gff:
        for line in gff:
            fields = line.strip().split('\t')
            if len(fields) < 8:
                continue
            feature = fields[2].lower()
            if feature == 'gene' or feature == 'region':
                has_record = False
                continue
            if not re.match(r'(.*)RNA', fields[2], re.IGNORECASE):
                continue
            if has_record:
                span = int(fields[4]) - int(fields[3])
                if span <= int(best[1]) - int(best[0]):
                    continue
            # The attribute column is the 9th; tolerate lines that lack it.
            attributes = fields[8] if len(fields) > 8 else ''
            name = _gff_name_attribute(attributes)
            if name is None:
                continue
            best = [fields[3], fields[4], name]
            if has_record:
                # A longer RNA replaces the record stored for this group.
                data[-1] = best
            else:
                data.append(best)
                has_record = True
    print( "%s Processing information files,got %d valid data...\n"% (gff_file,len(data)))
    save_log("%s Processing information files,got %d valid data...\n"% (gff_file,len(data)))
    return data

3.保存信息数据

def save_gff(data,gff_out_file):
    """Write the filtered annotation records to a text file.

    Each record becomes one line: its items are stripped of surrounding
    whitespace and joined with ``;``. Empty items are kept so that field
    positions stay aligned.

    Args:
        data: Iterable of records, each an iterable of strings.
        gff_out_file: Destination file path; its parent directory is
            created when missing.
    """
    out_dir = os.path.dirname(gff_out_file)
    # A bare file name has no directory part; nothing to create then.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(gff_out_file, 'w') as out_gff:
        for record in data:
            out_gff.write(';'.join(field.strip() for field in record) + '\n')
    print("Finished save information data...\n")
    save_log("Finished save information data\n save path:%s"%gff_out_file)

4.根据信息数据文件处理得到的数据,对基因数据文件进行筛选处理

def processing_fna(data, fna_file):
    """Keep the FASTA entries whose identifier occurs among the record names.

    The identifier of an entry is the header text after ``>`` up to the first
    tab or space. An entry is kept when that identifier, or its last
    ``|``-separated segment, occurs as a literal substring of the
    tab-joined names taken from ``data``. Identifiers are compared as plain
    text, not as regular expressions, so characters such as ``.`` and ``|``
    have no special meaning.

    Args:
        data: Records whose third item (index 2) is the name to match.
        fna_file: Path of the FASTA file to read.

    Returns:
        The kept lines (headers and sequence lines) in file order, each
        still ending with its original newline.
    """
    names = '\t'.join(record[2] for record in data)
    data_rna = []
    keep = False
    counts = 0
    with open(fna_file) as fna:
        for line in fna:
            if line.startswith(">"):
                seq_id = line.strip().split('\t')[0].split(' ')[0].strip(">")
                # Trailing pipes would leave an empty last segment.
                short_id = seq_id.rstrip("|").split("|")[-1]
                # Empty identifiers are skipped: "" is a substring of anything.
                keep = any(
                    candidate and candidate in names
                    for candidate in (seq_id, short_id)
                )
                if keep:
                    counts += 1
            if keep:
                data_rna.append(line)
    print( "%s Processing gene files,got %d valid data...\n"% (fna_file,counts))
    save_log("%s Processing gene files,got %d valid data...\n"% (fna_file,counts))
    return data_rna

5.保存基因数据

def save_FNA(data_rna,fna_out_file):
    """Write the filtered FASTA lines to a file and log the save location.

    Args:
        data_rna: Lines to write; each is written unchanged, so it must
            already carry its own line ending.
        fna_out_file: Destination file path; its parent directory is
            created when missing.
    """
    out_dir = os.path.dirname(fna_out_file)
    # A bare file name has no directory part; nothing to create then.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # The context manager closes the file so buffered data reaches disk.
    with open(fna_out_file, 'w') as out_fna:
        out_fna.writelines(data_rna)
    print("Save gene data completed...\n")
    save_log("Save gene data completed...\n save path:%s"%fna_out_file)

6.获取文件路径

def get_path_list(path,out_path):
    """Find the ``.gff`` and ``.fna`` files under a directory tree.

    Only files whose extension is exactly ``.gff`` or ``.fna`` are used.
    Files are keyed by their name without that extension, so a GFF file and
    a FASTA file with the same base name share a key.

    Args:
        path: Root directory to search recursively.
        out_path: Root directory for the output files.

    Returns:
        A tuple of four dicts keyed by base name:
        ``(gff input paths, gff output paths, fna input paths,
        fna output paths)``. Output paths are
        ``out_path/gff/out_<file>`` and ``out_path/transcript/out_<file>``.
        All paths use ``/`` as the separator.
    """
    gff_file_path = {}
    out_gff_file_path = {}
    file_path = {}
    out_file_path = {}
    for root, dirs, files in os.walk(path):
        for file_name in files:
            stem, ext = os.path.splitext(file_name)
            source = (root + '/' + file_name).replace('\\', '/')
            if ext == '.gff':
                gff_file_path[stem] = source
                out_gff_file_path[stem] = (out_path + '/gff/out_' + file_name).replace('\\', '/')
            elif ext == '.fna':
                file_path[stem] = source
                out_file_path[stem] = (out_path + '/transcript/out_' + file_name).replace('\\', '/')
    return gff_file_path,out_gff_file_path,file_path,out_file_path

7.主函数

def main(path,out_path):
    """Run the preprocessing pipeline for every GFF/FASTA pair under ``path``.

    For each GFF file found, the annotation is filtered and saved, then the
    FASTA file with the same base name is filtered against it and saved.
    Progress and timing are printed and appended to the log. A data set that
    has no matching FASTA file, or that raises during processing, is logged
    and skipped so the remaining sets still run. Data sets are numbered by
    their position, whether or not they succeed.

    Args:
        path: Root directory containing the input files.
        out_path: Root directory for the output files.
    """
    #get file path
    gff_path_list,out_gff_path_list,fna_path_list,out_fna_path_list = get_path_list(path,out_path)
    print('Have %d gff and fna files....\n'%len(gff_path_list))
    log = "\n\n"+str(datetime.datetime.now())+"\n data dir"+path+"\n save dir"+out_path+"\nHave %d gff and fna files....\n"%len(gff_path_list)
    save_log(log)
    for count, key in enumerate(gff_path_list, start=1):
        sta_time = time.time()
        print("processing %d set of data >>>>>\n"%count)
        save_log("processing %d set of data >>>>>\n"%count)
        if key not in fna_path_list:
            # A GFF file without a same-named FASTA file cannot be processed.
            message = "No .fna file found for %s, skipped\n"%key
            print(message)
            save_log(message)
            continue
        gff_file = gff_path_list[key]
        gff_out_file = out_gff_path_list[key]
        fna_file = fna_path_list[key]
        fna_out_file = out_fna_path_list[key]
        try:
            #1.Processing information files
            data = processing_gff(gff_file)
            #2.Save information data
            save_gff(data,gff_out_file)
            #3.Processing gene files
            data_rna = processing_fna(data, fna_file)
            #4.Save gene data
            save_FNA(data_rna,fna_out_file)
            ed_time = time.time()
            print("%d set of data, running time:%.4f s\n"%(count,(ed_time - sta_time)))
            save_log("%d set of data, running time:%.4f s\n"%(count,(ed_time - sta_time)))
        except Exception as e:
            # Top-level boundary: record the failure and move on to the
            # next data set instead of aborting the whole run.
            message = "%s: %s"%(type(e).__name__, e)
            save_log(message)
            print(message)
            continue

8.测试

if __name__ == '__main__':
    # Test run with hard-coded directories. A command-line variant (data
    # directory and save directory given as two arguments) is not enabled.
    path = 'biodata/'
    out_path = 'saveDate'

    start_time = time.time()
    main(path,out_path)
    end_time = time.time()

    # Format the summary once so the console and the log show the same text.
    total_message = "Total Running time:%.4f s"%(end_time - start_time)
    print(total_message)
    save_log(total_message)
运行结果:

Have 1 gff and fna files....

processing 1 set of data >>>>>

biodata/AF/gff/GCF_003254395.2_Amel_HAv3.1.gff Processing information files,got 11876 valid data...

Finished save information data...

biodata/AF/transcript/GCF_003254395.2_Amel_HAv3.1.fna Processing gene files,got 11876 valid data...

Save gene data completed...

1 set of dataa, running time:46.0196 s

Total Running time:46.0796 s

第二步:数据分析

根据数据结果进行人工分析,得出最终结论。

第三步:数据可视化

1.将所有处理得到的有效数据存入数据库。

2.通过前端页面进行数据统计等可视化展示。

你可能感兴趣的:(数据分析,Python)