python——爬虫学习——爬取淘宝搜索结果-(5)

下面是一个结合 requests 库与 re 库、爬取淘宝商品搜索结果的程序(Python 2):

# -*- coding: utf-8 -*
import io
import json
import locale
import re
import sys

import requests


# Taobao search-result scraper.
class TAOBAO(object):
    """Scrape price, sales count and title from Taobao search-result pages.

    The scraper fetches ``page`` consecutive result pages for one query,
    collects ``[price, sales, title]`` rows in ``self.ilt``, prints them as a
    table and writes the same table to ``<goods>.txt``.

    Written to run on both Python 2.7 and Python 3: all text is handled as
    unicode and the output file is always UTF-8.
    """

    # One pattern per field embedded in the page source. The title pattern
    # tolerates backslash-escaped characters (e.g. \" or \uXXXX) in the value.
    _PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
    _TITLE_RE = re.compile(r'"raw_title":"((?:[^"\\]|\\.)*)"')
    _SALES_RE = re.compile(r'"view_sales":"(\d*)')

    # Column layout shared by the console table and the output file.
    _ROW_TEMPLATE = u"{:^4}\t{:<14}\t{:<10}\t{:^80}"

    # Step applied to the "s" (offset) query parameter for each result page.
    _PAGE_SIZE = 44

    def __init__(self, baseUrl, page, goods):
        """Store the query.

        Args:
            baseUrl: search URL prefix; ``goods`` is appended to it.
            page: number of result pages to fetch.
            goods: search text, also used as the output file's base name.
        """
        self.goods = goods
        self.baseUrl = baseUrl + self.goods
        self.page = page
        self.defaultgoods = u"淘宝"
        self.ilt = []
        self.file = None

    def getHTMLText(self, url):
        """Fetch ``url`` and return its decoded text, or None on failure.

        Any ``requests`` error (connection problem, timeout, HTTP error
        status) is printed and reported to the caller as None.
        """
        headers = {
            'user-agent': "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
            'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        }
        try:
            r = requests.get(url, headers=headers, timeout=30)
            # Treat 4xx/5xx responses as failures instead of parsing an
            # error page.
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.exceptions.RequestException as e:
            print(e)
            return None

    @staticmethod
    def _decodeTitle(raw):
        """Decode JSON string escapes in ``raw``; return it unchanged if invalid."""
        try:
            return json.loads(u'"' + raw + u'"')
        except ValueError:
            return raw

    def parsePage(self, html):
        """Extract rows from ``html`` and append them to ``self.ilt``.

        Each row is ``[price, sales, title]`` where ``price`` is the matched
        text, ``sales`` is an int (0 when no digits were present) and
        ``title`` has its JSON escapes decoded. Values are parsed as data;
        the page text is never evaluated as code.

        Always returns None. When ``html`` is not text (for example None
        after a failed fetch) an error message is printed and nothing is
        added.
        """
        try:
            prices = self._PRICE_RE.findall(html)
            titles = self._TITLE_RE.findall(html)
            sales = self._SALES_RE.findall(html)
        except TypeError:
            print(u"摘取数据出错")
            return None
        # The three fields are matched independently, so pair them by
        # position and stop at the shortest list rather than indexing past
        # the end of one of them.
        for price, title, sold in zip(prices, titles, sales):
            count = int(sold) if sold else 0
            self.ilt.append([price, count, self._decodeTitle(title)])
        return None

    def _tableLines(self):
        """Yield the header line followed by one formatted line per row."""
        yield self._ROW_TEMPLATE.format(u"序号", u"价格", u"成交量", u"商品名称")
        for count, g in enumerate(self.ilt, 1):
            yield self._ROW_TEMPLATE.format(count, g[0], g[1], g[2])

    def printGoodsList(self):
        """Print the collected rows as a table."""
        for line in self._tableLines():
            print(line)

    def setFileTitle(self):
        """Open ``<goods>.txt`` for writing (UTF-8) and keep it in ``self.file``."""
        self.close()
        # Use the instance's own query text, not a module-level variable, so
        # the class also works when imported from another module.
        self.file = io.open(self.goods + ".txt", "w+", encoding="utf-8")

    def writeData(self):
        """Write the table to ``self.file``, opening the file first if needed."""
        if self.file is None:
            self.setFileTitle()
        self.file.write(u"".join(line + u"\n" for line in self._tableLines()))
        self.file.flush()

    def close(self):
        """Close the output file if it is open."""
        if self.file is not None:
            self.file.close()
            self.file = None

    def start(self):
        """Fetch every page, then print the table and write it to the file."""
        self.setFileTitle()
        try:
            for i in range(self.page):
                try:
                    url = self.baseUrl + '&s=' + str(self._PAGE_SIZE * i)
                    self.parsePage(self.getHTMLText(url))
                except Exception as e:
                    # Best effort: report the problem and move on to the
                    # next page.
                    print(e)
                    continue
            self.printGoodsList()
            self.writeData()
        finally:
            self.close()

# Search-URL prefix; TAOBAO.__init__ appends the query text to it.
baseurl = "https://s.taobao.com/search?q="

# Read the query, then decode it with the console's encoding, falling back to
# the locale's preferred encoding when stdin does not report one.
_typed_goods = raw_input("输入想要查询的物品:")
_console_encoding = sys.stdin.encoding or locale.getpreferredencoding(True)
goods = _typed_goods.decode(_console_encoding)

# Number of result pages to fetch.
page = int(raw_input("输入想要查询的页数:"))

# Run the scraper for the requested query and page count.
tb = TAOBAO(baseurl, page, goods)
tb.start()

你可能感兴趣的:(python)