Python下载TCGA数据-1(自己捣鼓版本)

磕磕绊绊基本学明白了R,现在用python下载和处理(文件名对应TCGA barcode),主要是通过任务带一下自己对python的学习;

准备说明

一共3个脚本,分别为var.py、fu.py和total.py;这里主要参考了TCGA官网的 Python 示例(Python Examples - GDC Docs,cancer.gov)。即,total.py是真正运行的脚本,var.py和fu.py是一些定义,放在相同文件夹即可;从网上看到的所有教程,当涉及到路径的时候,要有一个基本的认知:教程中出现的是发布者测试时用到的路径,并非直接适用,要改为自己的路径再运行;

  • 从网上下载用到的搜索固定格式,var.py;主要用于TCGA样本的测序数据下载和临床信息下载;filters是样本测序数据下载时用到的筛选用的框架,pre和pos是临床信息下载时用到的固定格式部分,middle在后面的脚本部分有展示,是测序数据的文件名信息,即基于测序数据中的文件名在TCGA中检索获得其对应的barcode和其他临床信息;
# GDC /files search filter: restrict to project TCGA-LUAD and to files
# produced by the "HTSeq - Counts" workflow. Additional criteria (e.g.
# cases.project.primary_site == "Lung" or cases.demographic.gender ==
# "female") can be added as further "in" clauses inside "content".
filters = {
    "op": "and",
    "content": [
        {
            "op": "in",
            "content": {
                "field": "cases.project.project_id",
                "value": ["TCGA-LUAD"],
            },
        },
        {
            "op": "in",
            "content": {
                "field": "files.analysis.workflow_type",
                "value": ["HTSeq - Counts"],
            },
        },
    ],
}
# Opening half of the GDC /files payload used to map downloaded file names
# back to TCGA barcodes: the comma-joined list of files.submitter_id values
# is inserted between `pre` and `pos` to form a single JSON request body.
pre = '''
{
"filters":{
"op":"in",
"content":{
"field":"files.submitter_id",
"value":[
'''
# Closing half: TSV output format, the barcode/aliquot fields to return,
# and the page size. Size raised from 100 to 1000 to match the download
# query ("size": "1000") — with 100 the clinical lookup silently
# truncated the result set when more than 100 files were downloaded.
pos = '''
]
}
},
"format":"TSV",
"fields":"file_id,file_name,cases.submitter_id,cases.case_id,data_category,data_type,cases.samples.tumor_descriptor,cases.samples.tissue_type,cases.samples.sample_type,cases.samples.submitter_id,cases.samples.sample_id,cases.samples.portions.analytes.aliquots.aliquot_id,cases.samples.portions.analytes.aliquots.submitter_id",
 "size":"1000"
}
'''
  • fu.py是用到的函数的定义
import tarfile
import os
import pandas as pd
def tar(fname):
    """Pack the directory *fname* into the archive "<fname>.tar.gz".

    Walks the whole tree rooted at fname and adds every regular file.
    Uses a context manager so the archive handle is closed even if
    adding a file raises (the original leaked the handle on error).
    """
    with tarfile.open(fname + ".tar.gz", "w:gz") as archive:
        for root, dirs, files in os.walk(fname):
            for name in files:
                archive.add(os.path.join(root, name))

def untar(fname, dirs):
    """Extract the tar archive *fname* into directory *dirs*.

    Uses a context manager so the archive handle is always closed
    (the original opened the file and never closed it).
    """
    with tarfile.open(fname) as archive:
        archive.extractall(path=dirs)
def readFile(filepath):
    """Merge every gzipped counts file under *filepath* into one DataFrame.

    Each *.gz file is read as a two-column TSV (gene ID, count); the count
    column is named after the file name with its 16-character suffix
    (e.g. ".htseq.counts.gz" — confirm your files use that naming) removed.
    Returns a DataFrame with one column per file plus a trailing
    'ENSEMBLE' column of gene IDs taken from the last file read
    (assumes all files share the same gene order — TODO confirm).

    Fixes over the original: no shadowing of the builtin `list`, the
    removed pandas kwarg `error_bad_lines` replaced by on_bad_lines='skip',
    and an empty directory returns an empty frame instead of raising
    NameError on the unbound `data` variable.
    """
    ens = pd.DataFrame()
    last = None
    for root, dirs, files in os.walk(filepath):
        for item in files:
            if item.split('.')[-1] == 'gz':
                tmp_path = os.path.join(root, item)
                item_pre = item[:-16]  # strip the fixed-length suffix
                last = pd.read_csv(tmp_path, compression='gzip',
                                   on_bad_lines='skip', sep='\t',
                                   names=['ENSEMBLE', item_pre])
                ens[item_pre] = last[item_pre]
    if last is not None:
        ens['ENSEMBLE'] = last['ENSEMBLE']
    return ens
  • total.py是整体结构;Part1是数据下载部分,最终下载结果以压缩包的形式存在;Part2是数据处理部分,解压缩并将文件批量读取为一个矩阵;Part3是临床信息下载;Part4是将TCGA的barcode整合成矩阵的列名;
# Star-imports pull in filters/pre/pos from var.py and tar/untar/readFile
# from fu.py (both must sit in the same folder as this script).
# Fixed: the module is named var.py, so "from va import *" raised
# ModuleNotFoundError.
from var import *
from fu import *
import os
import gzip
import pandas as pd
import requests
import re
import json
##Part1 Here we download matrix of counts of TCGA-LUAD
files_endpt = "https://api.gdc.cancer.gov/files"
# Fields to return from the GDC /files query; enough to map each file
# back to its case and project.
fields = [
"cases.submitter_id",
"file_name",
"cases.project.project_id",
"cases.project.primary_site"
]
fields = ','.join(fields)
# `filters` comes from var.py (star import); it must be JSON-encoded
# when sent as a GET query parameter.
params = {
    "filters": json.dumps(filters),
    "fields": fields,
    "format": "JSON",
    "size": "1000"
    }

response = requests.get(files_endpt, params = params)

file_uuid_list = []

# This step populates the download list with the file_ids from the previous query
for file_entry in json.loads(response.content.decode("utf-8"))["data"]["hits"]:
    file_uuid_list.append(file_entry["id"])

# Uncomment to download a slice of the hits only (e.g. for testing):
#file_uuid_list = file_uuid_list[51:100]
data_endpt = "https://api.gdc.cancer.gov/data"

params = {"ids": file_uuid_list}

# POST the UUID list; the GDC bundles all requested files into one tar.gz.
response = requests.post(data_endpt, data = json.dumps(params), headers = {"Content-Type": "application/json"})

# Server-suggested archive name from Content-Disposition (not used below).
response_head_cd = response.headers["Content-Disposition"]
path = 'D:/tmp/'  # NOTE(review): hard-coded Windows path -- change to your own

file_name = path + 'TCGA-LUAD.tar.gz'
with open(file_name, "wb") as output_file:
    output_file.write(response.content)
######Part2 Decompress and read files
f_gz = 'D:/tmp/TCGA-LUAD.tar.gz'
path1 = 'D:/tmp/tcga'
#### untar (defined in fu.py) unpacks the downloaded tar.gz; readFile then
#### walks the extracted folder and merges every file into one matrix,
#### keeping each file's second column (counts) and using the first column
#### (gene IDs) once as the shared index column.
untar(fname=f_gz,dirs=path1)
a = readFile('D:/tmp/tcga')
######Part3 download clinic info
path = 'D:/tmp/'
file_name = path + 'info.txt'
# Column names of the count matrix, minus the trailing ENSEMBLE column.
middle = pd.DataFrame(a.columns[:-1])
### `middle` holds the files.submitter_id values to query in the GDC,
### i.e. each file-name stem with '_count' appended.
middle = '"'+ middle +'_count"'
middle = ',\n'.join(middle[0])
### `whole` is the full JSON payload: pre/pos templates (var.py) wrapped
### around the comma-joined id list.
whole = pre+str(middle)+pos
with open(file_name, "wb") as output_file:
    output_file.write(bytes(whole.encode()))
url = 'https://api.gdc.cancer.gov/files'
# The payload is re-read from disk and POSTed as the request body.
payload = open("D:/tmp/info.txt")
headers = {'Content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
r = requests.post(url, data=payload, headers=headers)
path = 'D:/tmp/'
file_name = path + 'clinic.txt'
# The GDC answers in TSV (per the template's "format":"TSV"); save it.
with open(file_name, "wb") as output_file:
    output_file.write(r.text.encode())
#####Part4 process the clinic and matrix
# Index the count matrix by gene ID, then transpose so rows = files
# and columns = genes.
a.index = a.ENSEMBLE.tolist()
a1 = pd.DataFrame(a.values.T,index=a.columns, columns=a.index)
clinic = pd.read_table(file_name)
# Strip everything from '.ht...' onward so the clinic table's file names
# match the matrix row index (file-name stems).
clinic.file_name = [re.sub('\.ht.+$','',x) for x in clinic.file_name]
df = pd.merge(clinic,a1,left_on='file_name',right_index = True)
# Re-index rows by TCGA barcode and keep only the gene-count columns
# (those whose name contains 'ENS').
df.index = df.loc[:,'cases.0.submitter_id'].tolist()
df_data = df.iloc[:,[('ENS' in x ) for x in df.columns]]
df_data.to_csv('D:/tmp/df.csv')

你可能感兴趣的:(Python下载TCGA数据-1(自己捣鼓版本))