Web Scraping in Practice: Crawling the Klook Product Catalog

The goal this time is to scrape each product's name, list price, promotional price, and short description from Klook's Seoul listing pages, export the data to a CSV file, and save each product's cover image locally.

Source code

import requests
from lxml import etree
import csv

thing_list = []   # rows to be written to the CSV file
thing_id = 0      # running product id, also used as the image file name
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
for p in range(1, 21):  # walk through the first 20 listing pages
    url = 'https://www.klook.com/zh-CN/city/13-seoul/?city_id=13&limit=15&template_ids=&tag_ids=&instant=0&sort=&page='+str(p)
    res = requests.get(url, headers=headers).text
    selector = etree.HTML(res)
    for n in range(1, 16):  # each listing page shows at most 15 product cards
        thing_name = selector.xpath('//*[@id="filter-card-content"]/div['+str(n)+']/a/ul/li[1]/h3/text()')
        if thing_name == []:  # fewer than 15 cards on this page: skip the empty slot
            continue
        thing_id = thing_id + 1
        thing_price_now = selector.xpath('//*[@id="filter-card-content"]/div['+str(n)+']/a/ul/li[3]/p[2]/span[1]/b/text()')
        thing_price_pre = selector.xpath('//*[@id="filter-card-content"]/div['+str(n)+']/a/ul/li[3]/p[1]/del/text()')
        if thing_price_pre == []:
            # no <del> element: the product is not on promotion, so the displayed
            # price is the regular price and the promo-price column stays empty
            th_price_pre = thing_price_now[0].strip()
            th_price_now = ''
        else:
            th_price_pre = "¥ " + thing_price_pre[0].strip()
            th_price_now = thing_price_now[0].strip()
        thing_img = selector.xpath('//*[@id="filter-card-content"]/div['+str(n)+']/a/div/@data-original')  # cover image URL (lazy-loaded attribute)
        thing_site = selector.xpath('//*[@id="filter-card-content"]/div['+str(n)+']/a/@href')  # relative link to the detail page
        thing_url = "https://www.klook.com" + thing_site[0]
        # fetch the detail page to grab the first paragraph of the description
        thing_res = requests.get(thing_url, headers=headers).text
        selector_th = etree.HTML(thing_res)
        thing_intro = selector_th.xpath('//*[@id="description"]/div[1]/p/text()')
        if thing_intro == []:
            th_intro = ''
        else:
            th_intro = thing_intro[0]
        # strip thousands separators from the prices before recording the row
        thing_list.append([thing_name[0], thing_id, str(th_price_now).replace(',', ''), str(th_price_pre).replace(',', ''), th_intro, thing_img[0]])
        # download the cover image and name the file after the product id
        pic = requests.get(thing_img[0], headers=headers)
        with open(str(thing_id) + '.jpg', 'wb') as file:
            file.write(pic.content)

title_list = ["name", "id", "price_now", "price_pre", "intro", "img"]  # one header per field in each row
with open("thing.csv", "w", newline='', encoding='utf-8-sig') as t:  # utf-8-sig so Excel displays the Chinese text correctly
    writer = csv.writer(t)
    writer.writerow(title_list)
    writer.writerows(thing_list)
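
Note that the script fires off several hundred requests back-to-back (20 listing pages, plus one detail page and one image per product) with no timeout, no status check, and no pause, so a single failed request stops the whole run. Below is a minimal sketch of a small fetch helper that adds these safeguards; the helper name fetch and its parameters are my own choices, not part of the original script, and it assumes the same headers dictionary defined above.

import time
import requests

def fetch(url, headers, retries=3, delay=1.0):
    # GET a URL with a timeout, a status check, a short polite pause,
    # and a simple retry with linear back-off on network errors.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()      # treat 4xx/5xx responses as failures
            time.sleep(delay)            # avoid hammering the site
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise                    # give up after the last attempt
            time.sleep(delay * (attempt + 1))

Each requests.get(...) call in the script could then be swapped for this helper, for example res = fetch(url, headers).text for the pages and pic = fetch(thing_img[0], headers) for the images.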
