# xpath爬取瓜子二手车数据 (Scrape Guazi used-car listings with XPath)

import requests
from lxml import etree
import pymysql

from fake_useragent import UserAgent

class GuaZi:
    """Crawl paginated listing pages from www.guazi.com and store each item in MySQL.

    Instantiating the class starts the crawl immediately: ``__init__`` calls
    :meth:`get_html`, which fetches pages one by one, hands each parsed page
    to :meth:`parse_htmt`, which in turn writes every listing through
    :meth:`save_data`.
    """

    # URL template for listing pages; ``{}`` receives the 1-based page number.
    BASE_URL = "https://www.guazi.com/bj/buy/o{}"

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    # Request headers copied from a real browser session (the original post
    # illustrated where to copy them from with a screenshot:
    # https://img-blog.csdnimg.cn/20190324222221716.png).
    # NOTE(review): the Cookie is a captured session value and presumably
    # expires; refresh it if the site starts rejecting requests.
    HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # "br" is intentionally not advertised: Requests only decodes Brotli
        # when an optional brotli package is installed, and an undecoded body
        # would make both the XPath parsing and the next-page check fail.
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Cookie": "uuid=6be620ca-2940-4877-9e46-ab80d5630e02; antipas=872206X40812m544r24I68Y1Z3M7; cityDomain=bj; user_city_id=12; ganji_uuid=6651419019650537772954; lg=1; clueSourceCode=%2A%2300; sessionid=d9c3afac-eb32-4f92-a49f-146eb32ba313; cainfo=%7B%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22ca_i%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_a%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%226be620ca-2940-4877-9e46-ab80d5630e02%22%2C%22sessionid%22%3A%22d9c3afac-eb32-4f92-a49f-146eb32ba313%22%7D; preTime=%7B%22last%22%3A1553322585%2C%22this%22%3A1553242521%2C%22pre%22%3A1553242521%7D",
        "Host": "www.guazi.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    }

    # Keyword arguments passed to ``pymysql.connect``.
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    DB_CONFIG = {
        "host": "localhost",
        "user": "root",
        "password": "111111",
        "database": "test",
    }

    def __init__(self):
        """Initialise the row counter and start crawling right away."""
        # Running number of the next listing; also used as the row id.
        self.count = 1
        self.get_html()

    def get_html(self):
        """Fetch listing pages in order until a page has no next-page link.

        Each page is parsed with ``etree.HTML`` and passed to
        :meth:`parse_htmt`. The loop stops after the first page whose HTML
        does not contain the text "下一页" (the next-page label).
        """
        page = 1
        while True:
            print("================第{}页================".format(page))
            base_url = self.BASE_URL.format(page)
            response = requests.get(
                base_url,
                headers=self.HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            )
            html = response.text
            self.parse_htmt(etree.HTML(html))
            page += 1
            if "下一页" not in html:
                break

    @staticmethod
    def _first(values):
        """Return the first element of *values*, or '' when it is empty."""
        return values[0] if values else ''

    def parse_htmt(self, html):
        """Extract every listing from a parsed page and save it.

        Args:
            html: Parsed document (or any object exposing ``xpath``). ``None``
                is accepted and treated as a page without listings.

        For each ``<li data-scroll-track>`` element the price, description,
        year, mileage and image URL are extracted (missing values become
        ``''``), written via :meth:`save_data`, printed, and ``self.count``
        is incremented.
        """
        if html is None:
            return

        for li_xml in html.xpath(".//li[@data-scroll-track]"):
            # Price; the unit suffix is appended unconditionally.
            price = self._first(li_xml.xpath('.//a//p/text()')) + "万"
            # Description
            descw = self._first(li_xml.xpath('.//h2/text()'))
            # Year and mileage are the first and second text children of the
            # same node, so query it once and guard the second index.
            info = li_xml.xpath('.//div[@class="t-i"]/text()')
            years = info[0] if info else ''
            mileage = info[1] if len(info) > 1 else ''
            # Image URL without the "@..." suffix.
            images = self._first(li_xml.xpath('.//img/@src')).split("@")[0]

            guazi_dict = {
                "price": price,
                "descw": descw,
                "years": years,
                "mileage": mileage,
                "images": images,
            }
            # Persist the listing before reporting it.
            self.save_data(self.count, price, descw, years, mileage, images)
            print(self.count, guazi_dict)
            self.count += 1

    def save_data(self, count, price, descw, years, mileage, images):
        """Insert one listing into the ``guazi`` table, creating it if needed.

        Args:
            count: Row id to insert (converted with ``int``).
            price, descw, years, mileage, images: Column values (converted
                with ``str``).

        Saving is best-effort: any failure is printed and rolled back instead
        of being raised, so one bad row does not stop the crawl.
        """
        # Both handles start as None so the cleanup below is safe even when
        # the connection attempt itself fails.
        mydb = None
        mycon = None
        try:
            mydb = pymysql.connect(**self.DB_CONFIG)
            mycon = mydb.cursor()
            mycon.execute(
                'create table if not EXISTS guazi(id int(9) auto_increment primary key ,price varchar(30),descw VARCHAR(255),years VARCHAR(255),mileage VARCHAR(255),images VARCHAR(255) )'
            )
            # No "select *" read-back here: the default PyMySQL cursor is
            # buffered, so there is no pending result to drain, and
            # re-reading the whole table per insert is wasted work.
            sql = 'insert into guazi(id,price,descw,years,mileage,images)VALUES (%s,%s,%s,%s,%s,%s)'
            val = (int(count), str(price), str(descw), str(years), str(mileage), str(images))
            mycon.execute(sql, val)
            print(mycon.rowcount, "记录插入成功")
            mydb.commit()
        except Exception as exc:
            # Report the failure instead of silently discarding it.
            print("记录插入失败:", exc)
            if mydb is not None:
                mydb.rollback()
        finally:
            if mycon is not None:
                mycon.close()
            if mydb is not None:
                mydb.close()

# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    GuaZi()

# 你可能感兴趣的:(python,编程)  [You may also be interested in: python, programming]