# 淘宝商品数据爬取 (Taobao product data crawler)

import json
import re

import requests

def getHTMLText(url):
    """Fetch ``url`` over HTTP and return the response body as text.

    Args:
        url: Address to request.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection error, timeout, or a non-success HTTP status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into an exception so they also yield "".
        r.raise_for_status()
        # Decode using the encoding requests detects from the body itself
        # rather than relying only on the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no page". Only
        # request-related failures are swallowed, so KeyboardInterrupt and
        # programming errors still surface.
        return ""
    

def parsePage(ilt, html):
    """Extract price/title pairs from a search-result page into ``ilt``.

    The page text is scanned for ``"view_price":"..."`` and
    ``"raw_title":"..."`` fields. The n-th price is paired with the n-th
    title and appended to ``ilt`` as ``[price, title]`` (both strings).

    Args:
        ilt: List that receives the extracted ``[price, title]`` entries;
            it is mutated in place.
        html: Page text as returned by ``getHTMLText``. An empty or
            non-string value leaves ``ilt`` untouched.

    Returns:
        None.
    """
    if not isinstance(html, str) or not html:
        return

    # Capture only the value part of each field. This avoids splitting on
    # ':' (which truncated titles containing a colon) and avoids calling
    # eval() on text that comes from a remote server.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # Allow backslash escapes (including an escaped quote) inside the title.
    raw_titles = re.findall(r'"raw_title":"((?:[^"\\\n]|\\.)*)"', html)

    # zip() stops at the shorter list, so a page with unequal numbers of
    # prices and titles cannot raise an IndexError.
    for price, raw_title in zip(prices, raw_titles):
        try:
            # Decode escapes such as \u003d the same way a JSON parser would.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as captured.
            title = raw_title
        ilt.append([price, title])
    
def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table on stdout.

    A header row is printed first, followed by one row per entry of
    ``ilt`` numbered from 1.

    Args:
        ilt: Iterable of entries whose index 0 is the price and index 1 is
            the title.
    """
    # Minimum widths of the three columns: index, price, title.
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format('序号', '价格', '商品名称')
    print(header)
    for number, item in enumerate(ilt, start=1):
        print(row_format.format(number, item[0], item[1]))
        
    
    
def main():
    """Crawl Taobao search results for a fixed keyword and print them.

    Fetches ``depth`` result pages, collects the ``[price, title]`` entries
    found on each page, and prints them all as one table.
    """
    goods = '纸尿裤'
    depth = 2  # number of result pages to fetch
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            # The offset goes in the "s" query parameter and advances by 44
            # per page (presumably the number of items on one result page).
            # The "=" is required; without it the offset is never sent as
            # the value of "s".
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best effort: a failure on one page must not stop the others.
            continue
    printGoodsList(infoList)
    
# Run the crawl only when executed as a script, so that importing this
# module does not trigger network requests.
if __name__ == "__main__":
    main()

# 你可能感兴趣的:(python爬虫)