2020-04-28 爬虫一个网站的内容到另一个网站进行搜索

根据中国斑马鱼水生所对1号染色体所有基因进行敲除,想获取只和线粒体相关的斑马鱼敲除品系。
做法:打算先爬虫得到所有一号染色体上的基因,然后再将每一个基因输入到Genecard网站上,爬虫关于线粒体功能的内容,查找基因是否与线粒体相关,然后输出表格。
其他的内容爬取成功,但是下载图片没有成功

import requests
from lxml import etree
import os
from urllib.request import urlopen
from urllib import request
import time
#keyword=input('请输入想要查询的项目')
url = 'http://www.zfish.cn/TargetList.aspx'  # China Zebrafish Resource Center target list
r = requests.get(url)  # fetch the first listing page
r.encoding = 'utf-8'
root = etree.HTML(r.content)  # parse the HTML so we can query it with XPath
items = root.xpath('//*[@class="nei_content"]/div/table[2]/tr')
number = root.xpath('//*[@id="AspNetPager1"]/div[1]/text()')
# The pager text node looks like '共65页…'; take the digits between '共' and '页'.
# Reading number[0] directly is robust, unlike str()-ifying the whole xpath list,
# and an empty result (layout change) now yields 0 pages instead of a crash.
pages = int(number[0].strip().split('页')[0].split('共')[-1]) if number else 0
print(items)
print(pages)
Zkonumeber = []   # knockout line IDs (column 1), kept as xpath result lists
ALL_gene = []     # gene symbols (column 3), kept as xpath result lists
for page in range(pages):
    if page > 64:
        break  # hard cap so a bogus page count cannot loop forever
    # Each listing page is addressed by a 1-based ?page= query parameter.
    url = 'http://www.zfish.cn/TargetList.aspx?page=' + str(page + 1)
    time.sleep(3)  # pause BEFORE the request to rate-limit politely
    r = requests.get(url)
    r.encoding = 'utf-8'
    root = etree.HTML(r.content)
    goods = root.xpath('//*[@class="nei_content"]/div/table[2]/tr')
    for good in goods:
        try:
            zko_number = good.xpath('td[1]/a/text()')  # knockout line ID
            gene_name = good.xpath('td[3]/a/text()')   # gene symbol
            # Skip header/empty rows that have no anchor text in either column.
            if not zko_number or not gene_name:
                continue
            Zkonumeber.append(zko_number)
            ALL_gene.append(gene_name)
        except Exception:  # narrowed from a bare except; stay best-effort per row
            print(good.text)
###获得所有基因名字的集合
# Browser-like request headers: genecards.org refuses obviously scripted
# clients, so we present a realistic desktop-Chrome identity.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/77.0.3865.90 Safari/537.36'),
    'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
               'image/webp,image/apng,*/*;q=0.8,'
               'application/signed-exchange;v=b3'),
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'private',
    'Connection': 'keep-alive',
}
### Query GeneCards for every collected gene and record mitochondrial hits.
for keyword in ALL_gene:
    try:
        gene = keyword[0]  # each ALL_gene entry is a one-element xpath text list
        url = ('https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + gene
               + '&keywords=' + gene)
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        root = etree.HTML(r.content)
        # Subcellular-localization table: compartment names and confidence scores.
        words = root.xpath('//*[@id="localization"]/div[2]/div[2]/div[2]/table/tbody/tr/td[1]/text()')
        score = root.xpath('//*[@id="localization"]/div[2]/div[2]/div[2]/table/tbody/tr/td[2]/text()')
        dicta = dict(zip(words, score))
        if 'mitochondrion' in dicta:
            print(keyword, 'mitochondrion', dicta['mitochondrion'])
            # Append (not truncate) so every matching gene is recorded, and use a
            # context manager so the handle is always closed.  The original opened
            # the file in 'w' mode only when it did not yet exist and then crashed
            # on `keyword.word.dicta[word].read()` (lists have no attributes),
            # which is why the output file always ended up empty.
            with open('一号染色体gene', 'a', encoding='utf-8') as out:
                out.write('%s\t%s\t%s\n' % (gene, 'mitochondrion', dicta['mitochondrion']))
    except Exception as e:  # keep going on a per-gene failure, but report it
        print('Error:', e)



你可能感兴趣的:(2020-04-28 爬虫一个网站的内容到另一个网站进行搜索)