Code attached:
import json
import time
import pandas as pd
import os
from bs4 import BeautifulSoup
from pyecharts import Bar,Line,Overlap
from selenium import webdriver
os.chdir('D:/爬虫/综艺')
driver = webdriver.Chrome()
driver.maximize_window()
url = 'http://movie.douban.com/tag/#/?sort=U&range=2,' \
      '10&tags=2018,%E4%B8%AD%E5%9B%BD%E5%A4%A7%E9%99%86,%E7%BB%BC%E8%89%BA'
# open the listing page in a new tab, close the original blank tab,
# then continue in the tab that was just opened
js = 'window.open("' + url + '")'
driver.execute_script(js)
driver.close()
driver.switch_to.window(driver.window_handles[0])
# keep scrolling to the bottom and clicking the "加载更多" button until no more results load
while True:
    try:
        js = "var q=document.documentElement.scrollTop=10000000"
        driver.execute_script(js)
        driver.find_element_by_class_name('more').click()
        time.sleep(2)
    except:
        break

# collect the title, rating and detail-page link of every show on the listing page
name = [k.text for k in driver.find_elements_by_class_name('title')]
score = [k.text for k in driver.find_elements_by_class_name('rate')]
url = [k.get_attribute('href') for k in driver.find_elements_by_class_name('item')]
pd.DataFrame({'name': name, 'score': score, 'url': url}).to_excel('综艺名称.xlsx')
drama_list = pd.read_excel('综艺名称.xlsx')
driver = webdriver.Chrome()
driver.maximize_window()
drama_info = pd.DataFrame(columns=['id', 'name', 'image', 'score', 'count', 'year',
                                   'content', 'short', 'publish'])
actor_info = pd.DataFrame(columns=['name', 'url', 'drama_id', 'score', 'drama', 'rank', 'count'])
err = []
for i in range(drama_list.shape[0]):
    try:
        # open the detail page of the i-th show in a new tab,
        # close the current tab and switch to the new one
        url = drama_list['url'][i]
        js = 'window.open("' + url + '")'
        driver.execute_script(js)
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(2)
        bsObj = BeautifulSoup(driver.page_source, "html.parser")
        # the page metadata is embedded as JSON-LD in a <script> tag
        data = json.loads(
            bsObj.find('script', attrs={'type': 'application/ld+json'}).contents[0],
            strict=False)  # strict=False tolerates stray control characters in the JSON
        actor_name = [k['name'] for k in data['actor']]
        actor_url = [k['url'] for k in data['actor']]
        drama_score = data['aggregateRating']['ratingValue']
        drama_count = data['aggregateRating']['ratingCount']
        drama_name = data['name']
        drama_genre = data['genre']
        drama_image = data['image']
        drama_publish = data['datePublished']
        drama_year = bsObj.find('span', attrs={"class": "year"}).text[1:5]
        drama_content = bsObj.find('span', attrs={"property": "v:summary"}).text.strip()
        drama_short = [k.text for k in bsObj.find_all('span', attrs={"class": "short"})]
        drama_info = drama_info.append({'id': drama_list['url'][i], 'name': drama_name, 'image': drama_image,
                                        'score': drama_score, 'count': drama_count, 'year': drama_year,
                                        'content': drama_content, 'short': drama_short, 'publish': drama_publish},
                                       ignore_index=True)
        this_actors = pd.DataFrame({'name': actor_name, 'url': actor_url, 'drama_id': drama_list['url'][i],
                                    'score': drama_score, 'drama': drama_name, 'rank': list(range(len(actor_name))),
                                    'count': drama_count})
        actor_info = pd.concat([actor_info, this_actors])
        print(i)
    except:
        # record the show that failed so its URL can be retried later
        print(drama_list['name'][i])
        err.append(drama_list['url'][i])
        continue
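
For reference, the JSON-LD parsing step in the loop can be exercised on its own. The snippet below is a self-contained sketch that uses a made-up stand-in for a Douban detail page, so the tag layout is the only thing taken from the script above and the field values are illustrative only.

import json
from bs4 import BeautifulSoup

# minimal stand-in for a detail page: metadata lives in a
# <script type="application/ld+json"> block, exactly as the loop above assumes
html = '''
<html><head>
<script type="application/ld+json">
{"name": "示例综艺",
 "actor": [{"name": "主持人A", "url": "/celebrity/1/"}],
 "aggregateRating": {"ratingValue": "8.5", "ratingCount": "12345"},
 "datePublished": "2018-01-01"}
</script>
</head><body></body></html>
'''

bsObj = BeautifulSoup(html, "html.parser")
data = json.loads(bsObj.find('script', attrs={'type': 'application/ld+json'}).contents[0],
                  strict=False)
print(data['name'], data['aggregateRating']['ratingCount'])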
Error message:
ERROR:lml.utils:failed to import pyecharts_snapshot
Traceback (most recent call last):
File "D:\pycharm\lib\site-packages\lml\utils.py", line 43, in do_import
plugin_module = __import__(plugin_module_name)
ModuleNotFoundError: No module named 'pyecharts_snapshot'
Traceback (most recent call last):
File "E:/python/pyrequest-master/国产综艺节目爬取/zy.py", line 14, in
driver.switch_to.window(driver.window_handles[0])
File "D:\pycharm\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 726, in window_handles
return self.execute(Command.GET_WINDOW_HANDLES)['value']
File "D:\pycharm\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
self.error_handler.check_response(response)
File "D:\pycharm\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: no such session
(Driver info: chromedriver=2.41.578737 (49da6702b16031c40d63e5618de03a32ff6c197e),platform=Windows NT 10.0.17134 x86_64)
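
A "no such session" error typically means the driver's only window was closed before the switch, which ends the session. Below is a minimal sketch of the new-tab pattern the script relies on, assuming chromedriver is on PATH and the page is reachable: the original tab is closed only after window.open has created a second one.

from selenium import webdriver

driver = webdriver.Chrome()
driver.maximize_window()
# open the target page in a second tab first ...
driver.execute_script('window.open("http://movie.douban.com")')
# ... and only then close the original blank tab and switch to the remaining one
driver.close()
driver.switch_to.window(driver.window_handles[0])
print(driver.title)
driver.quit()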