爬取淘宝数据

爬取淘宝

今天坛子里有人说淘宝的反爬机制导致不容易爬取数据,求爬虫代码。为了积分,写了一下。它确实有些特殊,课上没有讲过,有时间加到课件里。它的动态加载数据不是通过数据包的 JSON,而是通过 JS 直接吐到页面,然后再通过 JS 和 CSS 把吐到页面的数据显示出来的。正则的写法要特别注意效率:写简单了,一个页面就要 590 多秒;改了一下,只要 7 秒多。

import requests
import re
import time
class DaoBaoSpider(object):
    """Crawler for Taobao keyword-search result pages.

    Taobao embeds the result data as JS-escaped JSON directly in the page
    HTML (not via a separate JSON endpoint), so item fields are extracted
    with non-greedy regexes over the raw page text.
    """

    # Patterns hoisted to class level and compiled once: the author's own
    # note says regex cost dominated runtime (590s -> 7s per page).
    # Captures (raw_title, detail_url) pairs from the embedded JS data.
    GOODS_RE = re.compile(
        r',"raw_title":"(.*?)".*?,"detail_url":"(.*?)","view_price":"', re.S)
    # Captures totalPage from the pager JSON fragment.
    PAGE_RE = re.compile(
        r'"pager":{"status":"show","data":{"pageSize":44,"totalPage":(.*?),".*?',
        re.S)

    def __init__(self):
        # A logged-in cookie is required, otherwise Taobao serves a login
        # page with no result data. Replace with your own cookie string.
        self.headers = {'cookie': '自己的登录Cookie.',
                        'user-agent': 'Mozilla/5.0'}
        # q = query keyword, s = result offset (44 items per page).
        self.url = 'https://s.taobao.com/search?q={}&s={}'

    @staticmethod
    def _parse_goods(html):
        """Extract [(title, url), ...] from raw search-page HTML.

        detail_url arrives JS-escaped (e.g. ``\\u002f`` for '/'), so it is
        unicode-unescaped and prefixed with the 'https:' scheme.
        """
        goods = []
        for title, raw_url in DaoBaoSpider.GOODS_RE.findall(html):
            url = 'https:' + raw_url.encode('utf-8').decode('unicode-escape')
            goods.append((title, url))
        return goods

    @staticmethod
    def _parse_page_count(html):
        """Return the total page count as an int; 0 if no pager is found.

        Fixes two defects of the original: it returned a *string* (so
        ``range(pageNum)`` in main raised TypeError) and it indexed
        ``findall(...)[0]`` unconditionally (IndexError on a login page).
        """
        matches = DaoBaoSpider.PAGE_RE.findall(html)
        return int(matches[0]) if matches else 0

    def get_page(self, url):
        """Fetch one search page and return its [(title, url), ...] list.

        Bug fix: the original built the list but never returned it, so
        every scraped item was discarded.
        """
        # timeout prevents a stalled connection from hanging forever.
        resp = requests.get(url=url, headers=self.headers, timeout=10)
        goods = self._parse_goods(resp.text)
        for item in goods:
            print(item)
        return goods

    def get_pageNum(self, url):
        """Fetch the first result page and return the total page count."""
        resp = requests.get(url=url, headers=self.headers, timeout=10)
        return self._parse_page_count(resp.text)

    def main(self):
        """Prompt for a keyword and crawl its search results."""
        goods = input('搜索商品:')
        page_count = self.get_pageNum(self.url.format(goods, 0))
        print(page_count)
        # Crawl only the first page by default, as in the original;
        # switch to range(page_count) for a full crawl — this now works
        # because get_pageNum returns an int.
        for i in range(1):
            self.get_page(self.url.format(goods, 44 * i))


if __name__ == '__main__':
    # Run the interactive crawl and report its wall-clock duration.
    spider = DaoBaoSpider()
    started = time.time()
    spider.main()
    elapsed = time.time() - started
    print('耗时:%.2f' % elapsed)

你可能感兴趣的:(爬虫,基础,python)