A few days ago I had an assignment: look up and download protein sequences from UniProt by accession number. After poking around the site, I noticed that every protein entry I clicked into had essentially the same URL pattern, which suggested the downloads could be scripted in bulk. Here is the code:
import requests

# Accession numbers to fetch. Note that the NP_/YP_ entries are NCBI RefSeq
# identifiers rather than UniProt accessions, so UniProt may not resolve them.
protein_ids = ['P24950', 'P41285', 'YP_209217', 'YP_002124314', 'NP_006926',
               'NP_115452', 'YP_001382257', 'YP_002213663', 'NP_008146',
               'NP_116779', 'NP_008302', 'NP008315', 'NP_007094']

with open('protein_sequences1.txt', 'w') as file:
    for protein_id in protein_ids:
        # Every entry's FASTA download follows the same URL pattern
        url = f'https://www.uniprot.org/uniprot/{protein_id}.fasta'
        response = requests.get(url)
        if response.ok:
            data = response.text
            try:
                # FASTA header looks like ">sp|P24950|NAME ..."; field 1 is
                # the accession. Use a new variable so the loop variable
                # protein_id still holds the requested ID for error messages.
                accession = data.split('|')[1]
                # Everything after the first newline is the sequence;
                # strip the line breaks to get one continuous string
                sequence = data[data.index('\n') + 1:].replace('\n', '')
                file.write(f'>{accession}\n{sequence}\n')
            except (IndexError, ValueError):
                print(f"Unable to process protein ID: {protein_id}")
        else:
            print(f"Failed to retrieve data for protein ID: {protein_id}")