Selenium 既可以用来做自动化测试,也可以用来开发爬虫:它拿到的网页是浏览器已经渲染过的,还可以配合 BrowserMob Proxy 来嗅探视频。
可以用 Chrome 或 Firefox 启动,我推荐 Firefox,因为 Firefox 的 profile 好用多了。
普通的启动方式如下:
from selenium import webdriver
# Start a new Chrome session and load the Baidu home page.
driver = webdriver.Chrome()
driver.get('http://www.baidu.com')
# Alternatively, do the same with Firefox.
driver=webdriver.Firefox()
driver.get('http://www.baidu.com')
这样启动的浏览器是全新的,没有任何 cookie 信息。当初我尝试了各种导入 cookie 的办法,直到发现了 profile。Firefox 的 profile 目录可以在「帮助 → 故障排除信息 → 配置文件 → 根目录」中找到:
# Build a Firefox profile object from an existing profile directory on disk
# and start Firefox with it instead of a blank profile.
profile=webdriver.FirefoxProfile(r'C:\Users\yearEamab\AppData\Roaming\Mozilla\Firefox\Profiles\q95dlwy9.default')
driver = webdriver.Firefox(profile)
import time  # required by the time.sleep() pauses below

# Start Chrome against an existing user data directory instead of a blank one.
option = webdriver.ChromeOptions()
option.add_argument(r'--user-data-dir=C:\Users\yearEamab\AppData\Local\Google\Chrome\User Data')  # set this to your own user data directory
driver = webdriver.Chrome(chrome_options=option)
driver.maximize_window()  # the window size can be adjusted here
driver.implicitly_wait(5)
driver.get("https://weibo.com")

# Open the Sina Tech Weibo page and repost its first post.
driver.get("https://weibo.com/sinatech?is_all=1")
# Presumably the "repost" link of the first feed item -- the XPath was copied
# from the browser inspector, so verify it against the current page layout.
driver.find_element_by_xpath('//*[@id="Pl_Official_MyProfileFeed__27"]/div[1]/div[2]/div[2]/div/ul/li[2]/a').click()
time.sleep(15)

# XPaths of the text area and the submit link used after the click above.
textarea_xpath = '//body/div[9]/div[2]/div[3]/div/div[2]/div/div[2]/div/div/div/div/textarea'
submit_xpath = '//body/div[9]/div[2]/div[3]/div/div[2]/div/div[2]/div/div/div/div[2]/div/a'
driver.find_element_by_xpath(textarea_xpath).clear()
driver.find_element_by_xpath(textarea_xpath).send_keys('厉害啊')
driver.find_element_by_xpath(submit_xpath).click()
time.sleep(5)
XPath 路径可以在浏览器「检查」面板里用 Copy XPath 直接复制,非常方便;一开始我还自己一层层去数,无语。
我还做了个 B 站的弹幕发送,但是由于 B 站的设置,每发一次所需的时间就会变长,只能发几次而已。
from selenium import webdriver
import time
def send_danmu(vedio_url, num):
    """Open a video page in Firefox and post one danmu (bullet comment).

    Firefox is started with an existing profile directory. The function
    prints the text of the element with class ``v-title``, switches into the
    page's iframe, types the comment, clicks the send control, and switches
    back to the top-level document. The browser is left open on return.

    Args:
        vedio_url: URL of the video page to open.
        num: Requested number of comments. Currently unused: the repeat loop
            was disabled in the original code, so exactly one comment is sent.
    """
    profile_url = r'C:\Users\yearEamab\AppData\Roaming\Mozilla\Firefox\Profiles\q95dlwy9.default'
    profile = webdriver.FirefoxProfile(profile_url)
    driver = webdriver.Firefox(profile)
    time.sleep(5)
    driver.get(vedio_url)
    time.sleep(20)  # fixed pause after navigation before touching the page
    # Single implicit-wait setting; it is in effect for every lookup below.
    driver.implicitly_wait(10)

    frame = driver.find_element_by_xpath('//body/div[4]/div/div/div[2]/iframe')
    element = driver.find_element_by_class_name('v-title')
    print(element.text)

    # The input box and send control are looked up inside the iframe.
    driver.switch_to.frame(frame)
    input_xpath = '//html/body/div/div/div/div[4]/div[3]/input'
    driver.find_element_by_xpath(input_xpath).click()
    driver.find_element_by_xpath(input_xpath).send_keys('这是弹幕')
    time.sleep(5)
    driver.find_element_by_xpath('//html/body/div/div/div/div[4]/div[3]/div[3]').click()

    # Leave the iframe so later lookups resolve against the main document.
    driver.switch_to.default_content()
if __name__ == "__main__":
    # Script entry point: post to one fixed episode page, requesting 10 comments.
    send_danmu(
        vedio_url=r'https://bangumi.bilibili.com/anime/6159/play#113891',
        num=10,
    )
# Locate the iframe element in the top-level document.
frame=driver.find_element_by_xpath('//body/div[4]/div/div/div[2]/iframe')
driver.switch_to.frame(frame)  # move the working context into the iframe
操作完 iframe 里的元素后,要记得把 driver 切换回主文档:
# Switch the driver's context back from the iframe to the top-level document.
driver.switch_to.default_content()
安装 scrapy 时出现了需要 Visual C++ 14.0 的报错,于是我去 http://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud 下载了 Twisted 的 whl 文件,安装成功。一开始把获取的数据导出到文件时,显示的是 unicode 转义编码,后来 Google 了才找到答案:在 settings 文件里加上 FEED_EXPORT_ENCODING='utf-8' 即可。
scrapy startproject new_spider 新建项目
scrapy crawl spider_name -o outputfile_name(输出文件可以是 json、csv、xml)启动爬虫
scrapy runspider 也可以启动爬虫,但是要先进入爬虫文件所在的目录
# Dump the raw response body to disk; the file is opened in binary mode
# because the body is written as-is. The with-block closes the file.
with open('article.txt','wb') as f:
    f.write(response.body)
此时 response.body 返回的是 bytes,所以要用 wb 模式打开文件才行
# Append "<title> <absolute url>" for one article; the file is opened in
# text mode with an explicit encoding. The with-block closes the file.
with open('article_title.txt','a',encoding='utf-8') as f:
    f.write(one_article.css('.title a::text').extract()[0]+' ')
    f.write(response.urljoin(one_article.css('.title a::attr(href)').extract()[0])+'\n')
而用选择器提取出来的却是 str,所以用 w 或 a 模式。
可以看到 response.url 是 str,而 response.body 是 bytes,搞不懂(response.body 是未解码的原始响应内容,解码后的文本可以用 response.text 获取)。下面的代码抓取 segmentfault 第一页所有文章的标题和链接,response.urljoin() 会自动把相对链接补全为完整的 url。
# -*- coding: utf-8 -*-
import scrapy
class SegmentfaultSpiderSpider(scrapy.Spider):
    """Spider that records article titles and links from the segmentfault front page."""

    name = 'segmentfault_spider'
    allowed_domains = ['segmentfault.com']
    start_urls = ['http://segmentfault.com/']

    def parse(self, response):
        """Append one ``<title> <absolute url>`` line per article to article_title.txt.

        Items matched by ``.stream-list__item`` that lack a title link are
        skipped instead of raising IndexError. Nothing is written when no
        article yields both a title and a link.
        """
        lines = []
        for one_article in response.css('.stream-list__item'):
            title = one_article.css('.title a::text').extract_first()
            href = one_article.css('.title a::attr(href)').extract_first()
            if title is None or href is None:
                continue
            # urljoin resolves the href against the response URL.
            lines.append(title + ' ' + response.urljoin(href) + '\n')

        if not lines:
            return

        # Open the output file once for the whole page rather than once per
        # article; the with-block closes it.
        with open('article_title.txt', 'a', encoding='utf-8') as f:
            f.writelines(lines)