Since I was planning to build a PC during the Double Eleven sale, I got the idea of collecting information (mainly prices) on specific products from JD.com. I spent some spare time learning a bit of Python, referred to other people's crawler code, and ended up with a fairly simple Python crawler. It can scrape product information either by product ID or by search keyword.
import requests
from lxml import etree
import csv
import time
import datetime

def getHeader(referer):
    headers = {
        'authority': 'search.jd.com',
        'accept': '*/*',
        'method': 'GET',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': referer,
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    return headers

def collectData(url, keyword):
    headers = getHeader(url)
    response = requests.get(url, headers=headers)  # fetch the search result page
    res_html = etree.HTML(response.text)  # parse the HTML
    total_page = res_html.xpath('.//span[@class="fp-text"]/i/text()')[0].strip()
    # each JD result page is delivered in two halves via s_new.php, hence 2 * total_page requests
    for i in range(1, int(total_page) * 2 + 1):
        log_id = '%.5f' % time.time()  # timestamp passed as the log_id parameter
        params = {
            'keyword': keyword,
            'suggest': '1.his.0.0',
            'wq': keyword,
            'page': str(i),
            's': str(1 + (i - 1) * 25),
            'scrolling': 'y',
            'log_id': log_id,
            'tpl': '1_M',
            'isList': '0'
        }
        response = requests.get('https://search.jd.com/s_new.php', headers=headers, params=params)  # fetch one half page
        res_html = etree.HTML(response.text)  # parse the HTML
        goods_list_items = res_html.xpath('//li[@class="gl-item"]')
        num_goods = len(goods_list_items)
        if num_goods == 0:
            print('Page {}: {} items on this page'.format(i, num_goods))
            print('None')
        else:
            print('Page {}: {} items on this page'.format(i, num_goods))
            x = 0
            for goods_list_item in goods_list_items:
                x += 1
                good_id = goods_list_item.xpath('@data-sku')[0].strip()
                price_url = f'http://p.3.cn/prices/mgets?skuIds={good_id}'
                try:
                    price_json = requests.get(price_url, headers=headers).json()
                    goods_now_price = price_json[0].get('p')  # current price
                    goods_old_price = price_json[0].get('m')  # original list price
                except Exception:
                    goods_now_price = ''
                    goods_old_price = ''
                item_url = 'https://item.jd.com/{}.html'.format(good_id)
                res1 = requests.get(item_url, headers=headers)
                res_item = etree.HTML(res1.text)
                try:
                    goods_brand = res_item.xpath('//ul[@id="parameter-brand"]/li/@title')[0]  # brand
                    good_name = res_item.xpath('//ul[@class="parameter2 p-parameter-list"]/li[1]/@title')[0]  # product name
                except Exception:
                    goods_brand = ''
                    good_name = ''
                today = datetime.date.today()
                row = [goods_brand, good_name, good_id, goods_now_price, goods_old_price, item_url, today]
                print(x, row)
                csv_file = open('备战双十一.csv', 'a', newline='', encoding='utf-8-sig')
                writer = csv.writer(csv_file)
                writer.writerow(row)
                print('\n')
                time.sleep(1)  # throttle requests
                csv_file.close()

def collectSingle(good_id):
    item_url = 'https://item.jd.com/{}.html'.format(good_id)
    headers = getHeader(item_url)
    price_url = f'http://p.3.cn/prices/mgets?skuIds={good_id}'
    try:
        price_json = requests.get(price_url, headers=headers).json()
        goods_now_price = price_json[0].get('p')  # current price
        goods_old_price = price_json[0].get('m')  # original list price
    except Exception:
        goods_now_price = ''
        goods_old_price = ''
    res1 = requests.get(item_url, headers=headers)
    res_item = etree.HTML(res1.text)
    try:
        goods_brand = res_item.xpath('//ul[@id="parameter-brand"]/li/@title')[0]  # brand
        good_name = res_item.xpath('//ul[@class="parameter2 p-parameter-list"]/li[1]/@title')[0]  # product name
    except Exception:
        goods_brand = ''
        good_name = ''
    today = datetime.date.today()
    row = [goods_brand, good_name, good_id, goods_now_price, goods_old_price, item_url, today]
    print(row)
    csv_file = open('单件商品.csv', 'a', newline='', encoding='utf-8-sig')
    writer = csv.writer(csv_file)
    writer.writerow(row)
    csv_file.close()
The main functionality consists of just two functions:
collectSingle(good_id): fetches the data for a single product by its product ID and appends it to a CSV file; it is really just a small piece of functionality split off from collectData.
collectData(url, keyword): fetches data from a result-list page URL plus a keyword and appends it to a CSV file. The URL mainly serves to add filter conditions, since some keywords return far too many results, and the keyword has to be written as 16-bit Unicode escapes (\uXXXX form). A sample pair of arguments is shown below, followed by a small usage sketch.
['https://search.jd.com/search?keyword=%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98m.2%E8%87%AA%E8%90%A5&wq=%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98m.2%E8%87%AA%E8%90%A5&ev=1948_82059%7C%7C124285%5E539_91928%5E',
'\u56fa\u6001\u786c\u76d8m.2\u81ea\u8425']
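For completeness, a minimal driver sketch using the sample arguments above might look like the following; the SKU ID passed to collectSingle is only a placeholder for illustration, not a real product ID.

if __name__ == '__main__':
    # Single product: pass a JD SKU ID (placeholder value here)
    collectSingle('1234567890')

    # Keyword search: the sample list-page URL and \u-escaped keyword from above
    list_url = 'https://search.jd.com/search?keyword=%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98m.2%E8%87%AA%E8%90%A5&wq=%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98m.2%E8%87%AA%E8%90%A5&ev=1948_82059%7C%7C124285%5E539_91928%5E'
    keyword = '\u56fa\u6001\u786c\u76d8m.2\u81ea\u8425'  # the \u escapes spell out the Chinese keyword
    collectData(list_url, keyword)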
With prior experience in other programming languages, the learning curve for Python is not very steep. No Python crawler framework is used here; it is just plain requests calls followed by parsing the HTML pages to pull out the data. Working through it by trial and error still brought real gains in understanding the HTTP protocol, debugging with Chrome, and picking up new material in general. All in all, it was a worthwhile learning exercise.
https://blog.csdn.net/sinat_20019511/article/details/104354417
https://blog.csdn.net/weixin_48615832/article/details/107174331