爬虫实战——客路商品id爬取

本次目标是将韩国所有商品id导出至csv

源代码

import requests
from lxml import etree
import csv

# Desktop-Chrome User-Agent so Klook serves the full HTML page instead of
# blocking the request as an obvious bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}

# Accumulator for scraped results: each entry is [activity_id, city_pinyin].
klook_id = []


def get_id(city_id, city_name):
    """Scrape Klook activity ids for one city and append them to ``klook_id``.

    Walks the city's listing pages (pages 1-22, up to 15 activity cards per
    page) and appends ``[activity_id, city_pinyin]`` for every card found.
    Stops at the first empty card slot, which marks the end of the results.

    Args:
        city_id: Numeric Klook city id used in the URL and query string.
        city_name: City slug as used in the Klook URL (e.g. ``'seoul'``).

    Side effects:
        Performs HTTP GETs against klook.com and mutates the module-level
        ``klook_id`` list. Returns ``None``.
    """
    # Map URL slugs to the pinyin label stored alongside each id. Unknown
    # slugs fall back to the slug itself (the original silently dropped them).
    city_pinyin = {
        'seoul': 'shouer',
        'jeju': 'jizhoudao',
        'busan': 'fushan',
        'gangwon-do': 'jiangyuandao',
        'gyeonggi-do': 'jingjidao',
        'incheon': 'renchuan',
    }
    label = city_pinyin.get(city_name, city_name)
    url = f'https://www.klook.com/zh-CN/city/{city_id}-{city_name}'

    for page in range(1, 23):
        payload = {'city_id': str(city_id), 'page': str(page)}
        res = requests.get(url, params=payload, headers=headers).text
        if not res:
            # Empty response body: skip this page rather than crash on parse.
            continue
        html = etree.HTML(res)
        for card in range(1, 16):
            kr_id = html.xpath(f'//*[@id="filter-card-content"]/div[{card}]/a/@data-id')
            if not kr_id:
                # First empty card slot means no more results for this city.
                # (The original set a flag here but kept scanning, and its
                # outer `while` never terminated when every page was full.)
                return
            klook_id.append([kr_id[0], label])


# Scrape every Korean city on Klook; each call appends its results to the
# module-level klook_id list. The numeric argument is Klook's city id.
get_id(13, 'seoul')
get_id(18, 'jeju')
get_id(46, 'busan')
get_id(156, 'gangwon-do')
get_id(157, 'gyeonggi-do')
get_id(158, 'incheon')

# Reshape each collected [activity_id, city_pinyin] pair into a CSV row with
# a fixed country code column ('kr') in the middle.
kr_list = [[kr_id, 'kr', city] for kr_id, city in klook_id]

title_list = ["kr_id", "Na", "city"]
# newline='' prevents blank rows on Windows (csv-module convention);
# explicit utf-8 keeps the output byte-identical across platforms.
with open("kr_id.csv", "w", newline='', encoding='utf-8') as t:
    writer = csv.writer(t)
    writer.writerow(title_list)
    writer.writerows(kr_list)

你可能感兴趣的:(爬虫实战——客路商品id爬取)