Page analysis

Inspecting the page source and the Network tab shows that the card data comes back as JSON in response to a POST request.

The request payload:

cardClass: hunter   # card class (the hero class)
keywords:           # keyword filter; empty = no filter
standard: 1         # restrict results to the Standard format
t: 1576286199445    # timestamp (the page sends milliseconds; the code below sends seconds, which the endpoint also accepts)
cardSet:            # card set (expansion); empty = all sets
p: 1                # page number (8 cards per page)

The code below also sends a cost field, empty for no mana-cost filter.
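Before writing the full crawler, it helps to fire one such request by hand and peek at the JSON that comes back. A minimal sketch; the response layout (a cards list whose items carry name and pic fields) is inferred from the jsonpath queries used later, so treat it as an assumption:

import json
import time
import urllib.parse
import urllib.request

url = 'https://hs.blizzard.cn/action/cards/query'
payload = {
    'cost': '',
    'cardClass': 'hunter',
    'keywords': '',
    'standard': '1',
    't': int(time.time()),
    'cardSet': '',
    'p': 1,
}
body = urllib.parse.urlencode(payload).encode('utf-8')
req = urllib.request.Request(url, data=body, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp:
    doc = json.loads(resp.read().decode('utf-8'))
# print a truncated, pretty-printed view to see where the cards live
print(json.dumps(doc, ensure_ascii=False, indent=2)[:800])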

Import all required packages

import urllib.request
import urllib.parse
import jsonpath   # third-party: pip install jsonpath
import json
import os
import time
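Note that jsonpath is not in the standard library. The crawler relies on two of this package's behaviors: the $.. recursive-descent operator, and the falsy return value when nothing matches. A quick sketch (the sample dict is made up for illustration):

import jsonpath

sample = {'data': {'cards': [{'name': 'Kill Command', 'pic': 'http://example/1.png'},
                             {'name': 'Animal Companion', 'pic': 'http://example/2.png'}]}}

print(jsonpath.jsonpath(sample, '$..cards..name'))  # ['Kill Command', 'Animal Companion']
print(jsonpath.jsonpath(sample, '$..missing'))      # False (not []) when there is no match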

Request and parse all the JSON

cardClasses = ["druid", "hunter", "mage", "paladin", "priest", "rogue",
               "shaman", "warlock", "warrior", "neutral"]

lushi_urls = 'https://hs.blizzard.cn/action/cards/query'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

 

for cardClass in cardClasses:
    if not os.path.exists(cardClass):
        os.mkdir(cardClass)                       # one folder per class
    print("Start crawling {}".format(cardClass))
    for p in range(1, 60):                        # more pages than any class has
        print(p)
        try:
            data = {
                'cost': '',
                'cardClass': cardClass,
                'keywords': '',
                'standard': '1',
                't': int(time.time()),
                'cardSet': '',
                'p': p
            }
            data = urllib.parse.urlencode(data).encode("utf-8")
            request = urllib.request.Request(url=lushi_urls, headers=headers, data=data)
            response = urllib.request.urlopen(request)
            content = response.read().decode('utf-8')
            jsondict = json.loads(content)
            # recursive-descent queries: every name/pic under any cards node
            card_names = jsonpath.jsonpath(jsondict, '$..cards..name')
            card_pics = jsonpath.jsonpath(jsondict, '$..cards..pic')

Create directories and save the files

for i in range(len(card_names)):
    # jsonpath returns False when a page has no cards; len(False) then raises
    # a TypeError, which the surrounding try/except uses to skip empty pages
    card_path = cardClass + "/" + card_names[i] + ".png"
    urllib.request.urlretrieve(url=card_pics[i], filename=card_path)
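Card names can contain characters that are illegal in file names (slashes, quotes, and so on), in which case urlretrieve fails and the whole page is skipped by the except. A hedged sketch of a safer save loop; the safe_name helper is my addition, not part of the original code:

import re

def safe_name(name):
    # replace characters that are invalid in Windows/Unix file names
    return re.sub(r'[\\/:*?"<>|]', '_', name)

for i in range(len(card_names)):
    card_path = cardClass + "/" + safe_name(card_names[i]) + ".png"
    urllib.request.urlretrieve(url=card_pics[i], filename=card_path)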

Full code

import urllib.request
import urllib.parse
import jsonpath   # third-party: pip install jsonpath
import json
import os
import time

print("Working hard to become a crawler master")
timestart = time.time()

cardClasses = ["druid", "hunter", "mage", "paladin", "priest", "rogue",
               "shaman", "warlock", "warrior", "neutral"]

lushi_urls = 'https://hs.blizzard.cn/action/cards/query'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

for cardClass in cardClasses:
    if not os.path.exists(cardClass):
        os.mkdir(cardClass)
    print("Start crawling {}".format(cardClass))
    for p in range(1, 60):
        print(p)
        try:
            data = {
                'cost': '',
                'cardClass': cardClass,
                'keywords': '',
                'standard': '1',
                't': int(time.time()),
                'cardSet': '',
                'p': p
            }
            data = urllib.parse.urlencode(data).encode("utf-8")
            request = urllib.request.Request(url=lushi_urls, headers=headers, data=data)
            response = urllib.request.urlopen(request)
            content = response.read().decode('utf-8')
            jsondict = json.loads(content)
            card_names = jsonpath.jsonpath(jsondict, '$..cards..name')
            card_pics = jsonpath.jsonpath(jsondict, '$..cards..pic')
            for i in range(len(card_names)):
                card_path = cardClass + "/" + card_names[i] + ".png"
                urllib.request.urlretrieve(url=card_pics[i], filename=card_path)
        except Exception:
            # empty pages make jsonpath return False and len(False) raise;
            # just move on to the next page
            continue

timeend = time.time()
print("Total time: {}".format(timeend - timestart))

 

Another example: requests + BeautifulSoup

import os
import time

import requests
from bs4 import BeautifulSoup

base_url = ''  # target page URL (not given in the original snippet)

# Send the request
def send():
    r = requests.get(url=base_url)
    # set the encoding explicitly to avoid mojibake
    r.encoding = "GBK"
    content = r.text
    parseAndSave(content)

# Parse the page and save the data
def parseAndSave(html):
    soup = BeautifulSoup(html, 'lxml')
    ulList = soup.find_all('ul', attrs={'class': 'kzlist'})
    for ul in ulList:
        li = ul.find_all('li')
        for item in li:
            name = item.find('img').next_sibling
            obtain_method = item.find('a').find('p').text
            rootDir = os.getcwd()
            if not os.path.exists(name):
                os.mkdir(name)
                os.chdir(name)
                src = item.find('a').find('img')['src']
                pic = requests.get(src)
                with open('pic.jpg', 'wb') as fw:
                    fw.write(pic.content)
                with open('info.txt', 'a+') as fw:
                    fw.write(name + '\n')
                    fw.write(obtain_method)
                os.chdir(rootDir)

def main():
    start_time = time.time()
    send()
    end_time = time.time()
    print('Elapsed:', (end_time - start_time))

if __name__ == '__main__':
    main()
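One detail worth knowing about the r.encoding = "GBK" line: requests derives the text encoding from the HTTP headers and typically falls back to ISO-8859-1 when the server declares none, which garbles Chinese pages. A small sketch of the relevant knobs (the URL is a placeholder):

import requests

r = requests.get('https://example.com')  # placeholder URL
print(r.encoding)           # encoding guessed from the response headers
print(r.apparent_encoding)  # encoding detected from the bytes themselves
r.encoding = r.apparent_encoding  # often safer for Chinese sites than the header guess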