基因数据处理与可视化
流程步骤:
第一步:数据预处理
第二步:数据分析
第三步:数据可视化
第一步:数据预处理
import re
import os
import time
import datetime
import sys
1.保存日志写入文件
def save_log(data, log_file_name='insects_log', path='/home/zhangkaifu/log/'):
    """Append a message to a plain-text log file.

    A newline is written first, followed by the payload. A string is written
    verbatim; a list is written one item per line (each item preceded by a
    newline). Any other type only produces the leading newline.

    Args:
        data: Message to record, either a ``str`` or a ``list`` of items.
        log_file_name: Log file name without the ``.txt`` extension;
            default is ``'insects_log'``.
        path: Directory that holds the log file; it is created when missing.
            Default is ``'/home/zhangkaifu/log/'``.
    """
    if not os.path.isdir(path):
        os.makedirs(path)
    # os.path.join tolerates a directory given with or without a trailing '/'.
    log_path = os.path.join(path, log_file_name + '.txt')
    # The context manager guarantees the handle is closed; the original
    # referenced ``out_log.close`` without calling it.
    with open(log_path, 'a+') as out_log:
        out_log.write('\n')
        if isinstance(data, str):
            out_log.write(data)
        elif isinstance(data, list):
            for line in data:
                out_log.write('\n' + str(line))
2.对信息数据文件(.gff)进行筛选处理,每个基因仅保留长度最长的一条 RNA 记录
def _gff_name_attribute(attributes):
    """Return the value of the first ``Name=`` attribute, or None if absent."""
    for attribute in str(attributes).split(';'):
        fields = attribute.split('=')
        if fields[0].lower() == 'name' and len(fields) > 1:
            return fields[1]
    return None


def processing_gff(gff_file):
    """Extract the longest named RNA record for each gene/region of a GFF file.

    The file is read as tab-separated rows. A row whose type column (index 2)
    is ``gene`` or ``region`` starts a new group. Among the following rows
    whose type matches ``(.*)RNA`` (case-insensitive), the one with the
    largest ``end - start`` span (columns 4 and 3) that carries a ``Name``
    attribute (column 8) is kept. The first named RNA of a group is always
    stored; a later one replaces it only when its span is strictly larger.

    Args:
        gff_file: Path of the GFF file to read.

    Returns:
        A list of ``[start, end, name]`` lists (all strings), one per group
        that contained at least one named RNA row.

    Raises:
        ValueError: If a start/end column compared by length is not an integer.
    """
    data = []
    # True once an RNA row of the current gene/region has been stored,
    # in which case that row is always data[-1].
    have_rna = False
    with open(gff_file) as gff:
        for line in gff:
            columns = line.strip().split('\t')
            if len(columns) < 8:
                continue
            feature = columns[2]
            lowered = feature.lower()
            if lowered == 'gene' or lowered == 'region':
                have_rna = False
                continue
            if not re.match(r'(.*)RNA', feature, re.IGNORECASE):
                continue
            if len(columns) < 9:
                # No attributes column, so no Name can be read; the original
                # raised IndexError here.
                continue
            if have_rna:
                best = data[-1]
                span = int(columns[4]) - int(columns[3])
                if span <= int(best[1]) - int(best[0]):
                    continue
            name = _gff_name_attribute(columns[8])
            if name is None:
                continue
            record = [columns[3], columns[4], name]
            if have_rna:
                data[-1] = record
            else:
                data.append(record)
                have_rna = True
    print( "%s Processing information files,got %d valid data...\n"% (gff_file,len(data)))
    save_log("%s Processing information files,got %d valid data...\n"% (gff_file,len(data)))
    return data
3.保存信息数据
def save_gff(data, gff_out_file):
    """Save the filtered information data (.gff records) to a text file.

    Each record is written on its own line with its fields stripped of
    surrounding whitespace and joined by ``';'``.

    Args:
        data: Iterable of records, each an iterable of strings.
        gff_out_file: Destination file path; its parent directory is created
            when it does not exist.
    """
    # os.path.dirname yields '' for a bare file name, unlike slicing with
    # rfind("/"), which then chopped the last character off the name.
    out_dir = os.path.dirname(gff_out_file)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    with open(gff_out_file, 'w') as out_gff:
        for fields in data:
            out_gff.write(';'.join(field.strip() for field in fields) + '\n')
    print("Finished save information data...\n")
    save_log("Finished save information data\n save path:%s"%gff_out_file)
4.根据上一步从信息数据文件中筛选出的转录本名称,对基因序列文件(.fna)进行筛选处理
def processing_fna(data, fna_file):
    """Keep only the FASTA records whose ID appears among the GFF names.

    A header line starts with ``'>'``. Its ID is the text before the first
    tab or space with the ``'>'`` removed. A record is kept when that ID, or
    its last non-empty ``'|'``-separated field, equals one of the names in
    ``data``. Kept records include the header and every line up to the next
    header.

    Matching is by exact equality. The original used the ID as a regular
    expression against a tab-joined string of names, so ``.`` and ``|`` in
    IDs and plain substrings produced false matches.

    Args:
        data: Records produced by ``processing_gff``; index 2 of each record
            is the name to look for.
        fna_file: Path of the FASTA (.fna) file to filter.

    Returns:
        A list of the kept lines, unchanged (line endings included).
    """
    wanted = set(record[2] for record in data)
    data_rna = []
    counts = 0
    keep = False
    with open(fna_file) as fna:
        for line in fna:
            if line.startswith('>'):
                seq_id = line.strip().split('\t')[0].split(' ')[0].strip('>')
                # Headers such as "gi|..|ref|XM_1.1|" end with '|', so take
                # the last non-empty field rather than a blank one.
                pipe_fields = [part for part in seq_id.split('|') if part]
                short_id = pipe_fields[-1] if pipe_fields else ''
                keep = seq_id in wanted or (short_id != '' and short_id in wanted)
                if keep:
                    counts += 1
            if keep:
                data_rna.append(line)
    print( "%s Processing gene files,got %d valid data...\n"% (fna_file,counts))
    save_log("%s Processing gene files,got %d valid data...\n"% (fna_file,counts))
    return data_rna
5.保存基因数据
def save_FNA(data_rna, fna_out_file):
    """Save the filtered gene data (FASTA lines) to a file.

    Args:
        data_rna: Iterable of lines to write; they are written unchanged, so
            each is expected to carry its own line ending.
        fna_out_file: Destination file path; its parent directory is created
            when it does not exist.
    """
    out_dir = os.path.dirname(fna_out_file)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    # The original never closed this handle, so buffered data could be lost.
    with open(fna_out_file, 'w') as out_fna:
        out_fna.writelines(data_rna)
    print("Save gene data completed...\n")
    # The original line was a bare string expression; log it like save_gff does.
    save_log("Save gene data completed...\n save path:%s"%fna_out_file)
6.获取文件路径
def get_path_list(path, out_path):
    """Collect input and output paths of every .gff and .fna file under path.

    The directory tree is walked recursively. A file is selected when its
    name contains the literal text ``.gff`` (or ``.fna``). The dictionary key
    is the part of the file name before that text, so a GFF file and a FASTA
    file with the same stem share a key. Backslashes in the resulting paths
    are replaced by ``'/'``.

    Args:
        path: Root directory to search.
        out_path: Root directory for results; GFF outputs go to
            ``<out_path>/gff/out_<file>`` and FASTA outputs to
            ``<out_path>/transcript/out_<file>``.

    Returns:
        A 4-tuple of dicts keyed by file stem: input GFF paths, output GFF
        paths, input FASTA paths, output FASTA paths.
    """
    gff_file_path = {}
    out_gff_file_path = {}
    file_path = {}
    out_file_path = {}
    for root, _dirs, files in os.walk(path):
        for file_name in files:
            source = (root + '/' + file_name).replace('\\', '/')
            # Literal substring test: the original regex r'(.*).gff' had an
            # unescaped dot and also matched names such as "xgff.txt".
            if '.gff' in file_name:
                key = file_name.split('.gff')[0]
                gff_file_path[key] = source
                out_gff_file_path[key] = (out_path + '/gff/out_' + file_name).replace('\\', '/')
            if '.fna' in file_name:
                key = file_name.split('.fna')[0]
                file_path[key] = source
                out_file_path[key] = (out_path + '/transcript/out_' + file_name).replace('\\', '/')
    return gff_file_path, out_gff_file_path, file_path, out_file_path
7.主函数
def main(path, out_path):
    """Run the whole preprocessing pipeline for every GFF/FASTA pair.

    For each GFF file found under ``path`` the matching FASTA file (same key
    from ``get_path_list``) is filtered and both results are written below
    ``out_path``. Progress, timings and failures are printed and logged via
    ``save_log``. A data set that fails is reported and skipped so the
    remaining sets are still processed.

    Args:
        path: Root directory containing the input .gff and .fna files.
        out_path: Root directory for the filtered output files.
    """
    gff_path_list, out_gff_path_list, fna_path_list, out_fna_path_list = get_path_list(path, out_path)
    print('Have %d gff and fna files....\n'%len(gff_path_list))
    log = "\n\n"+str(datetime.datetime.now())+"\n data dir"+path+"\n save dir"+out_path+"\nHave %d gff and fna files....\n"%len(gff_path_list)
    save_log(log)
    # The set number follows the position in the list, so a failed set does
    # not make the next one reuse its number.
    for count, key in enumerate(gff_path_list, 1):
        sta_time = time.time()
        print("processing %d set of data >>>>>\n"%count)
        save_log("processing %d set of data >>>>>\n"%count)
        if key not in fna_path_list:
            # A GFF file without a FASTA partner used to raise KeyError here.
            message = "no .fna file found for %s, skipped\n" % key
            print(message)
            save_log(message)
            continue
        gff_file = gff_path_list[key]
        gff_out_file = out_gff_path_list[key]
        fna_file = fna_path_list[key]
        fna_out_file = out_fna_path_list[key]
        try:
            data = processing_gff(gff_file)
            save_gff(data, gff_out_file)
            data_rna = processing_fna(data, fna_file)
            save_FNA(data_rna, fna_out_file)
            ed_time = time.time()
            print("%d set of dataa, running time:%.4f s\n"%(count,(ed_time - sta_time)))
            save_log("%d set of data, running time:%.4f s\n"%(count,(ed_time - sta_time)))
        except Exception as e:
            # ``except e:`` raised NameError, and save_log ignores non-str
            # payloads, so record the exception as text.
            save_log("%s: %s" % (type(e).__name__, e))
            print(e)
8.测试
if __name__ == '__main__':
    # Script entry point: process everything under the input directory and
    # report the total wall-clock time on the console and in the log.
    path = 'biodata/'
    out_path = 'saveDate'
    start_time = time.time()
    main(path, out_path)
    end_time = time.time()
    total_message = "Total Running time:%.4f s" % (end_time - start_time)
    print(total_message)
    save_log(total_message)
Have 1 gff and fna files....
processing 1 set of data >>>>>
biodata/AF/gff/GCF_003254395.2_Amel_HAv3.1.gff Processing information files,got 11876 valid data...
Finished save information data...
biodata/AF/transcript/GCF_003254395.2_Amel_HAv3.1.fna Processing gene files,got 11876 valid data...
Save gene data completed...
1 set of dataa, running time:46.0196 s
Total Running time:46.0796 s
第二步:数据分析
根据数据结果进行人工分析,得出最终结论。
第三步:数据可视化
1.将所有处理得到的有效数据存入数据库。
2.通过前端页面进行数据统计等可视化展示。