猫眼电影用selenium爬取
selenium拿到网页源码——page_source
from selenium import webdriver
from selenium.webdriver import ActionChains
import time
driver = webdriver.Chrome()  # launch a local Chrome instance controlled by Selenium
driver.get('https://www.baidu.com/')  # navigate to Baidu
html = driver.page_source  # HTML of the rendered page (after any JS ran)
print(html)
find()查看html中字符串是否存在——存在返回首次出现的下标,不存在返回-1
print(driver.page_source.find('kw'))
get_attribute——定位某一标签后查看某一属性值,获取节点属性值
猫眼电影中,找到海报图片链接——在a标签里面的img标签的src属性上
driver.get('https://maoyan.com/board/4')  # Maoyan Top-100 board
time.sleep(1)  # crude wait for the page to finish rendering
# NOTE(review): find_element_by_xpath is the pre-Selenium-4 API used throughout this file.
img_tag =driver.find_element_by_xpath('//*[@id="app"]/div/div/div[1]/dl/dd[1]/a/img[2]')
print(img_tag.get_attribute('src'))  # read the poster URL from the img tag's src attribute
爬取猫眼电影top100
分析:数据在dl里面,一个dd标签就是一个电影数据
from selenium import webdriver
import csv
driver =webdriver.Chrome()  # fresh browser session for the Top-100 crawl
driver.get('https://maoyan.com/board/4')  # page 1 of the Maoyan Top-100 board
def get_one_page():
    """Scrape one page of the Maoyan Top-100 board.

    Uses the module-level ``driver``'s current page. Each ``<dd>`` holds one
    film; its text is split on newlines into rank/name/actor/time/score.

    Returns:
        list[dict]: one dict per successfully parsed film. (The original
        version returned only the last film's dict and built an unused
        dict that shadowed the ``dict`` builtin.)
    """
    dd_lst = driver.find_elements_by_xpath('//*[@id="app"]/div/div/div[1]/dl/dd')
    films = []
    for dd in dd_lst:
        one_film_info_lst = dd.text.split('\n')
        item = {}
        try:
            item['rank'] = one_film_info_lst[0].strip()
            item['name'] = one_film_info_lst[1].strip()
            item['actor'] = one_film_info_lst[2].strip()
            item['time'] = one_film_info_lst[3].strip()
            item['score'] = one_film_info_lst[4].strip()
        except IndexError:
            # Entry had fewer than 5 text lines (malformed/ad row) — skip it
            # instead of silently keeping a partial dict.
            continue
        print(item, type(item))
        films.append(item)
    return films
# Page through the whole Top-100 board: scrape the current page, then click
# the "下一页" (next page) link; on the last page the link is missing, the
# lookup raises, and we quit the browser and stop.
while True:
    get_one_page()
    try:
        driver.find_element_by_link_text('下一页').click()
    except Exception as e:
        driver.quit()
        break
设置无界面设置
from selenium import webdriver
options =webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a visible window
driver =webdriver.Chrome(options=options)
爬取jd商品
下拉框拉条设置——拖动到最下面,解决ajax动态加载——execute_script(
'window.scrollTo(0,document.body.scrollHeight)'
)
self.driver.execute_script(
'window.scrollTo(0,document.body.scrollHeight)'
)
价格的xpath
翻页处理——最后一页div属性是pn-next disabled,如果是其他页数(不是最后一页,就是pn-next)
用find()来处理——page_source.find('pn-next disabled')返回-1说明不是最后一页,继续点下一页;否则已到最后一页,break
代码呈现
from selenium import webdriver
import time
class JdSpider():
    """Scrape JD search results for '爬虫书' using a headless Chrome."""

    def __init__(self):
        # Headless mode: no visible browser window.
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)
        self.driver.get('https://www.jd.com/')
        # Type the query into the search box and submit it.
        self.driver.find_element_by_xpath('//*[@id="key"]').send_keys('爬虫书')
        time.sleep(1)
        self.driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button').click()
        time.sleep(1)

    def parse_html(self):
        """Parse every product <li> on the current result page and print it."""
        # Scroll to the bottom so the AJAX-lazy-loaded items render.
        self.driver.execute_script(
            'window.scrollTo(0,document.body.scrollHeight)'
        )
        time.sleep(3)
        li_lst = self.driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li')
        for li in li_lst:
            try:
                item = {}
                # BUG FIX: the original wrote three of the four fields into
                # item['price'], each overwriting the previous one; use a
                # distinct key per field instead.
                item['price'] = li.find_element_by_xpath('.//div[@class="p-price"]/strong').text.strip()
                item['name'] = li.find_element_by_xpath('.//div[@class="p-name"]/a/em').text.strip()
                item['commit'] = li.find_element_by_xpath('.//div[@class="p-commit"]/strong').text.strip()
                item['shop'] = li.find_element_by_xpath('.//div[@class="p-shopnum"]/a').text.strip()
                print(item)
            except Exception as e:
                # Items missing a sub-element (ads, placeholders) are skipped,
                # but the error is printed rather than silently swallowed.
                print(e)

    def main(self):
        """Loop over result pages until the 'next' button is disabled."""
        while True:
            self.parse_html()
            # 'pn-next disabled' appears only on the last page; find() == -1
            # means it is absent, i.e. there are more pages to click through.
            if self.driver.page_source.find('pn-next disabled') == -1:
                self.driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[9]').click()
                time.sleep(1)
            else:
                self.driver.quit()
                break
if __name__ == '__main__':
    spider =JdSpider()  # constructing the spider opens Chrome and runs the search
    spider.main()