使用正则简单爬取淘宝页面信息

import json
import re

import requests

def getHtmlText(url):
    """Download a page and return its HTML as text.

    The request carries custom ``User-Agent`` and ``cookie`` headers because
    the target site applies anti-crawling measures. The header values below
    are placeholders and must be replaced with real ones (keep the cookie
    private).

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or ``None`` when the request fails (a
        failure message is printed in that case).
    """
    headers = {
        'User-Agent': '自己的',
        'cookie': '自己的(注意隐私)',
    }
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        # Decode with the encoding detected from the body rather than the
        # one declared in the response headers.
        resp.encoding = resp.apparent_encoding
        return resp.text
    except (requests.RequestException, ValueError):
        # RequestException covers network, timeout and HTTP-status failures.
        # ValueError is kept so that bad request input (such as header values
        # that cannot be encoded) is still reported instead of crashing.
        print('获取页面源代码失败')
        return None

def parsePage(ilt, html):
    """Extract price/title pairs from page source and append them to ``ilt``.

    The data is read from the ``"view_price":"..."`` and ``"raw_title":"..."``
    fields embedded in the page source (these may not be visible in the
    rendered page, so inspect the raw source when adjusting the patterns).

    Args:
        ilt: List that receives one ``[price, title]`` entry per item; it is
            modified in place.
        html: Page source as a string. Any other value (for example ``None``
            from a failed download) is reported and ignored.

    Returns:
        None.
    """
    if not isinstance(html, str):
        # A failed fetch hands over a non-string; report it instead of raising.
        print('正则错误')
        return

    prices = re.findall(r'"view_price":"(\d.*?)"', html)
    # The title pattern accepts backslash escapes so that an escaped quote
    # inside a title does not end the match early.
    titles = re.findall(r'"raw_title":"((?:\\.|[^"\\])*)"', html)

    # zip pairs values by position and stops at the shorter list, so a page
    # with unequal numbers of prices and titles cannot cause an IndexError.
    for price, raw_title in zip(prices, titles):
        try:
            # Decode escapes as a JSON string literal. This replaces eval(),
            # which must never be run on text scraped from a remote page.
            title = json.loads('"{}"'.format(raw_title))
        except ValueError:
            # Not a valid JSON string body; keep the text exactly as found.
            title = raw_title
        ilt.append([price, title])

def printList(ilt):
    """Print the collected items as a three-column table.

    A header row is printed first, followed by one row per entry with a
    running number that starts at 1.

    Args:
        ilt: Sequence of entries where index 0 is the price and index 1 is
            the title.

    Returns:
        None.
    """
    row_template = '{:<4}{:8}{:30}'
    print(row_template.format('序号', '价格', '名称'))
    for number, entry in enumerate(ilt, start=1):
        print(row_template.format(number, entry[0], entry[1]))

def main():
    """Crawl Taobao search results for one keyword and print them as a table.

    Fetches ``depth`` result pages for the keyword, collects the price and
    title of every item found, and prints the combined list. A page that
    fails is reported and skipped so the remaining pages are still processed.

    Returns:
        None.
    """
    goods = '书包'
    depth = 2  # number of result pages to crawl
    page_step = 44  # increment of the ``s`` offset between consecutive pages
    infor_list = []
    url = 'https://s.taobao.com/search?q={}'.format(goods)
    # Start at offset 0 so the first result page is included; the previous
    # range(1, 3) began at offset 44 and never requested the first page.
    for page in range(depth):
        start_url = url + '&s={}'.format(page_step * page)
        try:
            text = getHtmlText(start_url)
            parsePage(infor_list, text)
        except Exception as exc:
            # Best-effort crawl: report the failure and move on to the next
            # page instead of discarding the error silently.
            print('第{}页爬取失败:{}'.format(page + 1, exc))
            continue
    printList(infor_list)

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

你可能感兴趣的:(python)