import requests
from lxml import etree
import pymysql
from fake_useragent import UserAgent
class GuaZi:
    """Scrape used-car listings from guazi.com (Beijing) and store them in MySQL.

    Constructing an instance immediately starts the crawl: ``__init__`` calls
    ``get_html``, which walks the paginated listing, hands every page to
    ``parse_htmt`` and, through it, writes each listing with ``save_data``.
    """

    # Request headers copied from a real browser session on the listing page.
    # Screenshot showing where to copy them from:
    # https://img-blog.csdnimg.cn/20190324222221716.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQyOTkwNzM1,size_16,color_FFFFFF,t_70
    # NOTE(review): the Cookie is a captured session value and will presumably
    # expire; refresh it from the browser when requests start failing.
    HEADERS = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Cookie": "uuid=6be620ca-2940-4877-9e46-ab80d5630e02; antipas=872206X40812m544r24I68Y1Z3M7; cityDomain=bj; user_city_id=12; ganji_uuid=6651419019650537772954; lg=1; clueSourceCode=%2A%2300; sessionid=d9c3afac-eb32-4f92-a49f-146eb32ba313; cainfo=%7B%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22ca_i%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_a%22%3A%22-%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%226be620ca-2940-4877-9e46-ab80d5630e02%22%2C%22sessionid%22%3A%22d9c3afac-eb32-4f92-a49f-146eb32ba313%22%7D; preTime=%7B%22last%22%3A1553322585%2C%22this%22%3A1553242521%2C%22pre%22%3A1553242521%7D",
        "Host": "www.guazi.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    }

    # Seconds to wait for the server before giving up on a page request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Initialise the running record counter and start crawling."""
        # Sequence number of the next listing; also used as the row id.
        self.count = 1
        self.get_html()

    @staticmethod
    def _pick(values, index=0):
        """Return ``values[index]``, or ``''`` when that position is absent."""
        return values[index] if len(values) > index else ''

    def get_html(self):
        """Download listing pages one by one until no "next page" link remains."""
        page = 1
        while True:
            print("================第{}页================".format(page))
            base_url = "https://www.guazi.com/bj/buy/o{}".format(page)
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(
                base_url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT
            )
            html = response.text
            self.parse_htmt(etree.HTML(html))
            page += 1
            # The last page is the one without a "下一页" (next page) link.
            if "下一页" not in html:
                break

    def parse_htmt(self, html):
        """Extract every listing from a parsed page, save it and print it.

        Args:
            html: parsed document exposing ``xpath`` (as returned by
                ``etree.HTML``).
        """
        for li_xml in html.xpath(".//li[@data-scroll-track]"):
            # Price; the unit suffix "万" is always appended.
            price = self._pick(li_xml.xpath('.//a//p/text()')) + "万"
            # Description (listing title).
            descw = self._pick(li_xml.xpath('.//h2/text()'))
            # Year and mileage are the first and second text nodes of the
            # same element, so the query runs once. A listing with a single
            # text node yields an empty mileage instead of an IndexError.
            info = li_xml.xpath('.//div[@class="t-i"]/text()')
            years = self._pick(info, 0)
            mileage = self._pick(info, 1)
            # Image URL with everything from the first "@" onwards removed.
            images = self._pick(li_xml.xpath('.//img/@src')).split("@")[0]

            guazi_dict = {
                "price": price,
                "descw": descw,
                "years": years,
                "mileage": mileage,
                "images": images,
            }
            # Persist the record to the database.
            self.save_data(self.count, price, descw, years, mileage, images)
            print(self.count, guazi_dict)
            self.count += 1

    def save_data(self, count, price, descw, years, mileage, images):
        """Insert one listing into the ``guazi`` table, creating it if needed.

        Database errors are reported and the transaction is rolled back; they
        are not propagated, so one bad row does not abort the crawl.

        Args:
            count: value stored in the ``id`` primary-key column.
            price, descw, years, mileage, images: column values; each is
                converted with ``str`` before insertion.
        """
        # Both handles start as None so the cleanup below is safe even when
        # the connection attempt itself fails.
        mydb = None
        mycon = None
        try:
            # NOTE(review): credentials are hard-coded; consider moving them
            # to configuration.
            mydb = pymysql.connect(host='localhost', user='root', password='111111', database='test')
            mycon = mydb.cursor()
            mycon.execute(
                'create table if not EXISTS guazi(id int(9) auto_increment primary key ,price varchar(30),descw VARCHAR(255),years VARCHAR(255),mileage VARCHAR(255),images VARCHAR(255) )'
            )
            # No "select *" before inserting: it re-read the whole table for
            # every row and the insert does not depend on it.
            sql = 'insert into guazi(id,price,descw,years,mileage,images)VALUES (%s,%s,%s,%s,%s,%s)'
            val = (int(count), str(price), str(descw), str(years), str(mileage), str(images))
            mycon.execute(sql, val)
            print(mycon.rowcount, "记录插入成功")
            mydb.commit()
        except pymysql.MySQLError as err:
            print("记录插入失败:", err)
            if mydb is not None:
                mydb.rollback()
        finally:
            if mycon is not None:
                mycon.close()
            if mydb is not None:
                mydb.close()
if __name__ == '__main__':
    # Run the crawl only when executed as a script; constructing GuaZi
    # starts it because __init__ calls get_html().
    GuaZi()