爬取任务是:爬取所在地为新疆的卖新疆特产的店铺
代码如下:
import requests
from lxml.html import etree
import time
import csv
import json
# Search URL template for the Tmall listing page. `{}` is filled with the
# result offset `s`; `q` is a fixed, already percent-encoded search keyword.
num_url = 'https://list.tmall.com/search_product.htm?&s={}&q=%D0%C2%BD%AE&style=w'

# Request headers sent with every page fetch.
#
# NOTE(review): 'authority', 'method', 'path' and 'scheme' look like HTTP/2
# pseudo-header names copied from a browser's network panel. They are passed
# on as ordinary header fields here and are kept only so the request stays
# the same as before -- confirm whether they can be dropped.
#
# NOTE(review): 'cookie' is a hard-coded logged-in session (it contains
# `login=true` and an account nickname). It is a credential committed to
# source and will expire; consider loading it from the environment instead.
headers = {
    'authority': 'list.tmall.com',
    'method': 'GET',
    'path': '/search_product.htm?spm=a220m.1000858.1000724.8.3a1e1c0bSrXlCH&q=%D0%C2%BD%AE&sort=s&style=w&active=1&type=pc',
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'br' is deliberately NOT advertised: the HTTP stack can only decode a
    # Brotli body when an optional Brotli package is installed, and an
    # undecoded body would make `response.text` unparseable.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': 'hng=CN%7Czh-CN%7CCNY%7C156; t=a29d623a8f843100e07a63fa6a9be7ba; _tb_token_=eaf9eb4138e78; cookie2=76cfb0ab9ab85229f6a9d5e7d2b72b7a; _med=dw:1920&dh:1080&pw:1920&ph:1080&ist:0; cna=Ah+LF0eH8SgCAXPI7lIwMTDC; dnk=baibo_10; tracknick=baibo_10; lid=baibo_10; lgc=baibo_10; login=true; _uab_collina=159426438552735844726944; _l_g_=Ug%3D%3D; unb=2208491329497; cookie1=UoM%2BHZ3d9KVW%2FcqOuJndb4N9gmDoxQfoATwuiyb0MCI%3D; cookie17=UUphwoPq8l6RSWmJuw%3D%3D; _nk_=baibo_10; sg=076; uc1=pas=0&cookie21=UIHiLt3xSalX&cookie14=UoTV6OOALc3vdg%3D%3D&existShop=false&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie15=V32FPkk%2Fw0dUvg%3D%3D; uc3=id2=UUphwoPq8l6RSWmJuw%3D%3D&lg2=VT5L2FSpMGV7TQ%3D%3D&nk2=AQWXT8s4%2FnI%3D&vt3=F8dBxGJkmmCYmi1Wjmk%3D; uc4=id4=0%40U2grGRvmz3awCzNAzYX1fCs5s%2FYUElZ4&nk4=0%40A6jN89MCWyt2dd100rryB22sMg%3D%3D; sgcookie=E41wJVnTjnM13OCVb34gZ; csg=93c75414; enc=Pf99cJFaJo5hBH3muF25MsYEgGQT9ctP1rQ2NvQowi9QRbP4qGII7jjZWzVKbgpkyH7j4jmQXCpSkfigDBx1iHwS1vGuu3IBotvXiu2NDqs%3D; cq=ccp%3D0; _m_h5_tk=d8f8aed551db79c9cbcfcc67dbc4984a_1594295982376; _m_h5_tk_enc=fa5846124b4f789e21e658d8d330acbe; x5sec=7b22746d616c6c7365617263683b32223a223431636133643366303966303263366438366633636463316530386233313938435058796d2f67464549696e6e64614435364c7056786f514d6a49774f4451354d544d794f5451354e7a73794e773d3d227d; res=scroll%3A1903*10565-client%3A1903*937-offset%3A1903*10565-screen%3A1920*1080; pnm_cku822=098%23E1hvgpvUvbpvUpCkvvvvvjiPnLFUtjDRR2sUlj3mPmPUsj3ER2LWzjY8PLcOgjEhPuwCvvpvvhHh2QhvCPMMvvvCvpvVvUCvpvvvuphvmvvvpLbWvx2PkphvC9hvpyP9sbyCvm9vvhCvvvvvvvvvBGwvvUjZvvCj1Qvvv3QvvhNjvvvmmvvvBGwvvvUUmphvLU8p3gwa1COqrqpyCW2%2B%2Bfmt%2BeCBTWeARdItb9TxfXkK53hzZIeUHd8reC69D70OdiZBEH4D0Bh7%2Bul1bPoxdXkKD7UjAO97%2Bu0XjovtvpvhvvCvpv%3D%3D; l=eBQYJ3TROjBPSi-tXOfZourza77T7IRAguPzaNbMiOCP_b5p5g9CWZlyJ1Y9CnGVh62eR379TqKMBeYBqIYpBkymtBALukkmn; isg=BDMz4r_NaNyLKiQIUmq24K1vwjddaMcqRIoUluXQk9KJ5FOGbTpeeTJ2mhQKxB8i',
    'referer': 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000724.8.212b1c0bt68AVX&s=20&q=%D0%C2%BD%AE&sort=s&style=w&active=1&type=pc',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
}
# Crawl search-result pages 1-5 and append every shop whose location text
# contains '新疆' to 新疆.csv as one row: (shop name, location, shop URL).
#
# Author's note (translated): there are 80 result pages in total; to keep the
# request rate low, crawl 5 pages per run with a 3-second pause after each
# page. Even so, the site flagged the crawler after 20-odd pages.

# Label that precedes the location inside the shop header text.
ADDRESS_LABEL = '所在地:'
# Path, relative to one shop entry, of the header block holding name/location.
SHOP_INFO_XPATH = 'div[@class="shopHeader"]/div[@class="shopHeader-info"]'

# Open the output once for the whole run: the context manager guarantees the
# file is flushed and closed even if a page fails, and closing no longer
# depends on at least one shop having matched.
# NOTE: no `encoding` is passed, so the platform default encoding is used.
with open('新疆.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for i in range(1, 6):
        try:
            print(f'第{i}页')
            # The `s` query parameter is a result offset: 20 entries per page.
            url = num_url.format((i - 1) * 20)
            # A timeout keeps a stalled connection from hanging the crawl;
            # on expiry requests raises and the run stops after the pause.
            response = requests.get(url, headers=headers, timeout=30)
            html = etree.HTML(response.text)
            items = html.xpath('//div[@class="shopCon"]/div')
            for item in items:
                try:
                    # The second text node under the header's <p> elements
                    # is expected to carry the location line.
                    address = item.xpath(SHOP_INFO_XPATH + '/p/text()')[1]
                except IndexError:
                    # Entry without a location line: treat as "no address".
                    address = ''
                # Remove only the leading label. str.strip(chars) would treat
                # the label as a character set and could also eat matching
                # characters from either end of the location itself.
                if address.startswith(ADDRESS_LABEL):
                    address = address[len(ADDRESS_LABEL):]
                if '新疆' not in address:
                    continue
                store_name = item.xpath(SHOP_INFO_XPATH + '/a/text()')[0]
                store_url = 'https://list.tmall.com/' + item.xpath(SHOP_INFO_XPATH + '/a/@href')[0]
                print(f'{store_name}|{address}|{store_url}')
                writer.writerow([store_name, address, store_url])
        finally:
            # Pause after every page, whether it succeeded or raised.
            time.sleep(3)