### The code in this article is provided for learning purposes only; please do not use it commercially.
This time I used XPath instead of regex or BS4, simply because it has been a really long time since I last used XPath.
The one real difficulty to overcome is JD's AJAX loading: fetching the search page with requests alone only returns the first 30 products, so you also have to issue a GET request against the AJAX endpoint to retrieve the rest. The b variable in the code is actually a UNIX timestamp; the sketch below shows how the two URLs pair up.
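To make the pagination arithmetic concrete, here is a minimal sketch (my addition, not part of the original script; jd_urls is a hypothetical helper) of the two URLs behind one visual search page:

import time

def jd_urls(n):
    # Visual page n = an odd "front" page (first 30 items, page=2n-1)
    # plus an even AJAX page (remaining items, page=2n, item offset s=48n-20).
    log_id = '%.5f' % time.time()  # UNIX timestamp with five decimal places
    front = ('https://search.jd.com/Search?keyword=电视机&enc=utf-8'
             '&page=' + str(2 * n - 1))
    back = ('https://search.jd.com/s_new.php?keyword=%E7%94%B5%E8%A7%86%E6%9C%BA'
            '&enc=utf-8&page=' + str(2 * n) + '&s=' + str(48 * n - 20)
            + '&scrolling=y&log_id=' + log_id)
    return front, back

print(jd_urls(1))  # the URL pair for visual page 1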
PS: I really want to buy a Sony TV, because Android TV plus no boot-up ads... My suggestion: go for a set with more than 2 GB of RAM and an A73-class CPU, and pick Android with later flashing or third-party apps in mind.
# -*- coding: utf-8 -*-
# coded by 伊玛目的门徒
import requests
from lxml import etree
import pandas as pd
import time

start = time.perf_counter()  # time.clock() was removed in Python 3.8

# Plain browser UA, kept as a lighter alternative to the full `head` dict below
header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.XXXX.XXX Safari/537.36'}
# Request headers copied from the browser's DevTools. The Cookie value is
# session-specific and expires; replace it with one from your own session.
head = {'authority': 'search.jd.com',
        'method': 'GET',
        'path': '/s_new.php?keyword=%E7%94%B5%E8%A7%86%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%94%B5%E8%A7%86%E6%9C%BA',
        'scheme': 'https',
        'referer': 'https://search.jd.com/Search?keyword=%E7%94%B5%E8%A7%86%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E7%94%B5%E8%A7%86%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        'Cookie': 'pinId=P7U4oU1101vnxG4Eir9OvbV9-x-f3wj7; shshshfpa=3120508e-1db3-33c7-d34d-733fc57f5c61-1527256832; shshshfpb=2b679cd3d88844dc7956f01c306d1e8ea5b0817076f66fb12543702016; qrsc=3; __jdu=2045744036; TrackID=1wgah744lMsZWXTwC-q-Biyu37x7sxp8CseoInLtJZAVBG6zQJux5G9mvMGBhV3T3hMRrtlFHyNDJTPVf9ckESDMKHNDoZr-Wr6ETbyEoNGE; __jdv=122270672|baidu|-|organic|not set|1552825203464; areaId=2; PCSYCityID=2; shshshfp=f1024c8fe38dc72f7cbd01d3e8120c95; xtest=428.cf6b6759; ipLoc-djd=2-2830-51800-0; user-key=859e46c8-d778-4972-8cb6-435b9d3378b6; cn=0; 3AB9D23F7A4B3C9B=J6H7CPQOXRM4BEEC34ESMFZSPDECXYZUVXQW2BOOW2P3QJXJOXSEAMVSMD5DV3NK7MR3DZNGSK2XAS6OUUNRDR532U; __jda=122270672.2045744036.1495102986.1552825203.1552906130.51; __jdc=122270672; rkv=V0500; __jdb=122270672.2.2045744036|51.1552906130; shshshsID=8782cfd99609426bed86f079aa971f79_2_1552906303039'
        }
# Unused here (see the commented-out requests.get call in frontpage); kept as
# the author's alternative way of passing the cookie string.
cookies = {'cookies': 'pinId=P7U4oU1101vnxG4Eir9OvbV9-x-f3wj7; shshshfpa=3120508e-1db3-33c7-d34d-733fc57f5c61-1527256832; shshshfpb=2b679cd3d88844dc7956f01c306d1e8ea5b0817076f66fb12543702016; qrsc=3; __jdu=2045744036; TrackID=1wgah744lMsZWXTwC-q-Biyu37x7sxp8CseoInLtJZAVBG6zQJux5G9mvMGBhV3T3hMRrtlFHyNDJTPVf9ckESDMKHNDoZr-Wr6ETbyEoNGE; __jda=122270672.2045744036.1495102986.1541941006.1552825203.50; __jdc=122270672; __jdv=122270672|baidu|-|organic|not set|1552825203464; areaId=2; PCSYCityID=2; shshshfp=f1024c8fe38dc72f7cbd01d3e8120c95; xtest=428.cf6b6759; rkv=V0500; 3AB9D23F7A4B3C9B=J6H7CPQOXRM4BEEC34ESMFZSPDECXYZUVXQW2BOOW2P3QJXJOXSEAMVSMD5DV3NK7MR3DZNGSK2XAS6OUUNRDR532U; ipLoc-djd=2-2830-51800-0; __jdb=122270672.4.2045744036|50.1552825203; shshshsID=99df96dc146b5071a6934e385d2d7f33_4_1552826030047'}
name = []  # collected product titles
p = []     # collected prices (as strings)

def frontpage(n):
    # "Front" pages hold the first 30 items and use odd page numbers: 1, 3, 5, ...
    url = 'https://search.jd.com/Search?keyword=电视机&enc=utf-8&qrst=1&rt=1&stop=1&page=' + str(2 * n - 1)
    # html = requests.get(url, headers=header, cookies=cookies)
    html = requests.get(url, headers=head)
    html.encoding = 'utf-8'
    selector = etree.HTML(html.text)
    goods_name = selector.xpath('//div/a/em/text()[1]')
    for i in goods_name:
        print(i)
        name.append(i)
    price = selector.xpath('//div/strong/i')
    for i in price:
        print(i.text)
        p.append(i.text)
    print(len(goods_name))
    print(len(price))
    # x1 = selector.xpath('//li[19]/div/div[2]/strong/i')  # leftover debug probe
    print('###############################################')
def backpage(n):
    a = time.time()
    b = '%.5f' % a  # log_id: UNIX timestamp with five decimal places
    # Earlier variant (note the stray wq=手机 left over from a phone scraper); unused:
    # url_last = 'https://search.jd.com/s_new.php?keyword=%E7%94%B5%E8%A7%86%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&page=' + str(2 * n) + '&s=' + str(48 * n - 20) + '&scrolling=y&log_id=' + str(b)
    # AJAX endpoint for the remaining items: even page numbers, s is the item offset
    url_test = 'https://search.jd.com/s_new.php?keyword=%E7%94%B5%E8%A7%86%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&page=' + str(2 * n) + '&s=' + str(48 * n - 20) + '&scrolling=y&log_id=' + str(b)
    html2 = requests.get(url_test, headers=head)
    html2.encoding = 'utf-8'
    selector = etree.HTML(html2.text)
    goods_name = selector.xpath('//div/a/em/text()[1]')
    for i in goods_name:
        print(i)
        name.append(i)
    print(len(goods_name))
    price = selector.xpath('//div/strong/i')
    for i in price:
        print(i.text)
        p.append(i.text)
    print(len(price))
    print('###############################################')
for n in range(1, 10):
    try:
        frontpage(n)
    except Exception as e:
        print(e)
        print('fail')
        print(2 * n - 1)
    try:
        backpage(n)
    except Exception as e:
        print(e)
        print('fail')
        print(2 * n)
df = pd.DataFrame({
    'goods_name': name,
    'price': p
})
print(df)
end = time.perf_counter()
print('elapsed: %f s' % (end - start))
print('################ saving csv ####################')
df.to_csv('京东电视机.csv')
Crawler output: (screenshot from the original post omitted)
Video demo:
https://www.bilibili.com/video/av46769501/
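One post-processing note: to_csv above stores the prices as strings. A minimal cleanup sketch (my addition, assuming the 京东电视机.csv file written by the script above) to make the column numeric:

import pandas as pd

df = pd.read_csv('京东电视机.csv', index_col=0)
df['price'] = pd.to_numeric(df['price'], errors='coerce')  # unparseable values become NaN
print(df.sort_values('price').head(10))  # the ten cheapest listings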
There is also a Chrome-automation approach (selenium) that I borrowed from someone else to handle the AJAX loading. To be honest, though, it is really slow; multithreading is recommended (see the thread-pool sketch after this script).
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By  # Selenium 4 locator API
import time

print('yes')
driver = webdriver.Chrome()
# s1 = '%E8%A3%A4%E5%AD%90'  # alternative keyword: 裤子 (trousers)
s1 = '%E7%94%B5%E8%A7%86'    # URL-encoded keyword: 电视 (TV)
url = 'https://search.jd.com/Search?keyword=' + s1 + '&enc=utf-8&page=1'
driver.get(url)
print(driver.current_url)
t1 = time.time()
n = 10  # controls how far the page scrolls (the scrolling triggers the AJAX loading)
for i in range(1, n + 1):
    s = "window.scrollTo(0,document.body.scrollHeight/{0}*{1});".format(5, i)
    print(s, len(driver.page_source), time.time() - t1)
    driver.execute_script(s)
    time.sleep(0.5)
# find_elements_by_class_name was removed in Selenium 4; use By instead
lis = driver.find_elements(By.CLASS_NAME, "gl-item")
for i in range(len(lis)):
    print(lis[i].find_element(By.CLASS_NAME, "p-name").text)
    print(lis[i].find_element(By.CLASS_NAME, "p-price").text)
    # print(lis[i].text)
driver.quit()
print("end")