from time import sleep
import json
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
class SpiderVip(object):
    """Scraper for vip.com search results.

    Renders search pages with PhantomJS (the site injects the goods list
    via JavaScript), parses the rendered HTML with BeautifulSoup, and
    saves the collected items to a CSV file.
    """

    def __init__(self, url, start_page, end_page, keyword):
        self.url = url                # base search URL, ends with '?'
        self.start_page = start_page  # first page to fetch (str or int)
        self.end_page = end_page      # last page to fetch, inclusive
        self.keyword = keyword        # search term typed by the user
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

    def handle_url(self, page):
        """Build the search URL for *page*, render it in PhantomJS, and
        return the fully loaded page source as an HTML string."""
        query = urllib.parse.urlencode({
            "keyword": self.keyword,
            "page": page,
        })
        url = self.url + query
        # NOTE(review): PhantomJS is deprecated in recent Selenium
        # releases; consider headless Chrome/Firefox when upgrading.
        driver = webdriver.PhantomJS()
        try:
            driver.get(url)
            sleep(10)  # wait for the initial JS render
            # Scroll down repeatedly so lazily loaded goods are rendered.
            js = "document.body.scrollTop = '10000'"
            for _ in range(15):
                driver.execute_script(js)
                sleep(3)
            return driver.page_source
        finally:
            # Fix: always release the browser process; the original
            # leaked one PhantomJS instance per page fetched.
            driver.quit()

    def download(self, html):
        """Parse rendered search-result HTML and return a list of dicts,
        one per goods item (img_url/title/discount/price/oldPrice)."""
        soup = BeautifulSoup(html, "lxml")
        items_list = []
        for content in soup.select(".goods-list .goods-list-item .goods-inner"):
            items_list.append({
                # img src is protocol-relative ("//..."), so add a scheme.
                "img_url": "http:" + content.select(".goods-image img")[0]["src"],
                "title": content.select("h4 a")[0].get_text(),
                "discount": content.select(".goods-price-info .goods-discount")[0].get_text(),
                "price": content.select(".goods-price-info .goods-sells-price")[0].get_text(),
                "oldPrice": content.select(".goods-price-info .goods-market-price ")[0].get_text(),
            })
        return items_list

    def save_goods(self, goods_data_list):
        """Write the scraped goods dicts to goodscsv.csv (header + rows)."""
        file_name = ["title", "price", "discount", "oldPrice", "img_url"]
        # newline="" is required by the csv module; without it every row
        # is followed by a blank line on Windows.
        with open("goodscsv.csv", "w", encoding="utf-8", errors="ignore", newline="") as cp:
            # Fix: create the writer once (the original rebuilt it on
            # every row) and emit a header so the columns are labelled.
            goods_csv = csv.DictWriter(cp, fieldnames=file_name)
            goods_csv.writeheader()
            goods_csv.writerows(goods_data_list)

    def startVip(self):
        """Entry point: crawl every page in [start_page, end_page] and
        save the accumulated goods to CSV."""
        goods_all_list = []
        for page in range(int(self.start_page), int(self.end_page) + 1):
            html = self.handle_url(page)
            goods_all_list += self.download(html)
        self.save_goods(goods_all_list)
# 主函数
def main():
    """Prompt the user for a search keyword and a page range, then
    build a SpiderVip and start the crawl."""
    base_url = "http://category.vip.com/suggest.php?"
    search_word = input("请输入要搜索的物品: ")
    first_page = input("请输入开始页: ")
    last_page = input("请输入结束页: ")
    # Construct the spider and kick off the crawl in one step.
    SpiderVip(base_url, first_page, last_page, search_word).startVip()
# Run the interactive crawler only when executed as a script.
if __name__ == '__main__':
    main()