以下是利用Python的requests库进行爬取采集的代码示例:
import re
from datetime import datetime

import pandas as pd
import requests
def get_shop_info(shop_url):
    """Fetch a shop page and extract the shop name and rating.

    Args:
        shop_url: Full URL of the shop page.

    Returns:
        A ``(shop_name, shop_rating)`` tuple of strings, or ``None`` for
        either element when the page does not contain the expected field
        (instead of raising IndexError as the original code did).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # timeout prevents one hung connection from stalling the whole crawl
    response = requests.get(shop_url, headers=headers, timeout=10)
    # Parse embedded JSON-ish fields out of the shop page HTML
    names = re.findall(r'"user_nick":"(.*?)"', response.text)
    ratings = re.findall(r'"shopCardMainGrade":(\d+\.\d+)', response.text)
    # Guard against missing matches instead of crashing with IndexError
    shop_name = names[0] if names else None
    shop_rating = ratings[0] if ratings else None
    return shop_name, shop_rating
def crawl_shop_list(item_url):
    """Crawl the shop list linked from a Taobao item page and save it to Excel.

    Follows the first ``shopLink`` on the item page to the shop-collection
    page, scrapes up to the first 100 shop links there, fetches each shop's
    name and rating via ``get_shop_info``, and writes the results to
    ``shop_data.xlsx`` in the current directory.

    Args:
        item_url: URL of the Taobao item page to start from.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # timeout prevents a hung connection from blocking the crawl
    response = requests.get(item_url, headers=headers, timeout=10)
    # Parse the item page for the shop-collection link
    shop_list_url = re.findall(r'"shopLink":"(.*?)"', response.text)[0]
    # Strip JSON escape backslashes from the extracted URL
    shop_list_url = shop_list_url.replace('\\', '')
    response = requests.get(shop_list_url, headers=headers, timeout=10)
    # Take at most the first 100 shop links from the collection page
    shop_links = re.findall(r'"shopLink":"(.*?)"', response.text)[:100]
    shop_data = []
    for i, shop_link in enumerate(shop_links, start=1):
        shop_url = 'https:' + shop_link
        shop_name, shop_rating = get_shop_info(shop_url)
        # Record the actual collection timestamp instead of the literal
        # placeholder string '采集时间' the original code stored
        collected_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        shop_data.append([i, shop_name, shop_url, shop_rating, collected_at])
    # Write the collected rows to an Excel file
    df = pd.DataFrame(shop_data, columns=['序号', '店铺名称', '店铺地址', '比逛好店合集/淘宝神店榜热卖榜/粉丝榜/回头客榜/黑马榜', '采集时间'])
    df.to_excel('shop_data.xlsx', index=False)
if __name__ == '__main__':
    # Entry point: start the crawl from this item page.
    target_item = 'http://item.taobao.com/item.htm?id=638673761014'
    crawl_shop_list(target_item)
注意事项:
- item_url 为宝贝的地址,请根据需求修改。
- User-Agent 可能需要根据实际情况进行调整。
- 采集结果保存在 shop_data.xlsx 文件中,可以根据需要修改文件名和路径。

本文由 mdnice 多平台发布