Scraping some product data from vip.com (Vipshop). Because the listing page is loaded dynamically via JSON, Selenium + PhantomJS is used to render it.
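Note: PhantomJS is no longer maintained and its WebDriver support was removed from newer Selenium releases. As a minimal sketch, the same rendering step could be done with headless Chrome instead (this assumes Chrome and a matching chromedriver are installed and on PATH; the keyword/page in the example URL are placeholders). Only the driver construction in handle_url below would change; the rest of the script works the same way.

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")          # render without opening a window
driver = webdriver.Chrome(options=options)  # chromedriver must be on PATH
driver.get("http://category.vip.com/suggest.php?keyword=test&page=1")  # placeholder query
html = driver.page_source                   # same downstream parsing applies
driver.quit()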

from time import sleep
import json
import csv
import urllib.parse

from selenium import webdriver
from bs4 import BeautifulSoup




class SpiderVip(object):
    def __init__(self, url, start_page, end_page, keyword):
        self.url = url
        self.start_page = start_page
        self.end_page = end_page
        self.keyword = keyword
        # User-Agent header (not used by the PhantomJS driver below)
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}


    # Build the search URL, render the page with PhantomJS, and return the page source
    def handle_url(self, page):
        data = {
            "keyword": self.keyword,
            "page": page
        }
        data = urllib.parse.urlencode(data)
        url = self.url + data
        # print(url)
        # Open the page with PhantomJS so the JSON-driven content gets rendered
        driver = webdriver.PhantomJS()
        driver.get(url)
        sleep(10)  # crude wait for the initial render (an explicit-wait sketch follows the script)
        # Scroll down repeatedly so lazily loaded goods are rendered
        js = "document.body.scrollTop = '10000'"
        for i in range(15):
            driver.execute_script(js)
            sleep(3)
        # driver.save_screenshot("vip.png")  # screenshot for debugging
        html = driver.page_source  # rendered page source
        driver.quit()  # release the PhantomJS process
        return html




    # Parse the rendered source and extract the goods data
    def download(self, html):
        soup = BeautifulSoup(html, "lxml")
        contents_list = soup.select(".goods-list .goods-list-item .goods-inner")
        items_list = []
        for content in contents_list:
            items_dict = {}
            # image URL
            img_url = "http:" + content.select(".goods-image img")[0]["src"]
            # title
            goods_title = content.select("h4 a")[0].get_text()
            # discount
            goods_discount = content.select(".goods-price-info .goods-discount")[0].get_text()
            # current price
            goods_price = content.select(".goods-price-info .goods-sells-price")[0].get_text()
            # original price
            goods_old_price = content.select(".goods-price-info .goods-market-price")[0].get_text()
            # print(img_url)
            items_dict["img_url"] = img_url
            items_dict["title"] = goods_title
            items_dict["discount"] = goods_discount
            items_dict["price"] = goods_price
            items_dict["oldPrice"] = goods_old_price
            items_list.append(items_dict)
        return items_list




    # Save the extracted data
    def save_goods(self, goods_data_list):
        # save as JSON
        # goods_json = json.dumps(goods_data_list)
        # with open("goods.json", "w", encoding="utf-8") as fp:
        #     fp.write(goods_json)
        #     fp.flush()

        # save as CSV
        field_names = ["title", "price", "discount", "oldPrice", "img_url"]
        with open("goodscsv.csv", "w", encoding="utf-8", errors="ignore", newline="") as cp:
            goods_csv = csv.DictWriter(cp, fieldnames=field_names)
            goods_csv.writeheader()
            goods_csv.writerows(goods_data_list)




    # Public entry point: crawl every page in the range and save the results
    def startVip(self):
        goods_all_list = []
        for page in range(int(self.start_page), int(self.end_page) + 1):
            html = self.handle_url(page)
            goods_all_list += self.download(html)
        # print(goods_all_list)
        self.save_goods(goods_all_list)



# main: read the search keyword and page range, then start crawling
def main():
    url = "http://category.vip.com/suggest.php?"
    keyword = input("Enter the keyword to search for: ")
    start_page = input("Enter the start page: ")
    end_page = input("Enter the end page: ")

    # Create a spider object and start the crawl
    spider = SpiderVip(url, start_page, end_page, keyword)
    spider.startVip()



if __name__ == '__main__':
    main()
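The fixed sleep(10) / sleep(3) waits in handle_url are a blunt way to let the JavaScript finish. As a hedged alternative sketch, Selenium's explicit waits could block until the goods list actually appears in the DOM (this assumes the same .goods-list-item selector used above and the driver variable created in handle_url):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 20 seconds for at least one goods item to be present in the DOM
WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".goods-list .goods-list-item"))
)

The scrolling loop would stay as it is; only the blind sleeps before reading page_source would be replaced.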










