# 1. Config (constants used by the crawler below)
# MongoDB settings: host handed to MongoClient, database name, and the
# collection that receives the scraped comments.
MONGO_URL = 'localhost'
MONGO_DB = 'iqiyi'
MONGO_TABLE = 'iqy_comments'

# Command-line switches passed to the PhantomJS driver via service_args.
SERVICE_ARGS = [
    '--load-images=false',
    '--disk-cache=true',
]

# Highest page index get_comments() processes before it stops.
CIRCLE = 50
import pymongo
import time
import datetime
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from CommentSpider.Config import *
# Database: module-level MongoDB client plus a handle to the MONGO_DB
# database; save_to_mongo() writes through `db`.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# Driver: a single browser instance shared by every function in this module.
# The commented-out lines below are an alternative Chrome (optionally
# headless) setup kept for reference.
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# browser = webdriver.Chrome(chrome_options=chrome_options,executable_path='D:\\ProgramData\\Anaconda3\\Scripts\\chromedriver')
# browser = webdriver.Chrome()
# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and removed
# in Selenium 4 — confirm the pinned selenium version before upgrading.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
browser.set_window_size(1400, 900)
# Explicit wait reused by all wait.until(...) calls; 15 is the timeout in seconds.
wait = WebDriverWait(browser, 15)
# Dates: '%m-%d' labels for today and yesterday, computed once at import time.
# get_comments() substitutes them for relative timestamps found on the page.
now = datetime.date.today()
one_day = datetime.timedelta(1)
today = format(now, '%m-%d')
yesterday = format(now - one_day, '%m-%d')
def save_to_mongo(result):
    """Store *result* in the MONGO_TABLE collection unless an identical document exists.

    De-duplication is an exact-match query: a document counts as a duplicate
    only when every field of *result* matches a stored document.

    Args:
        result: dict describing one comment. PyMongo adds an ``_id`` key to
            it when the insert succeeds.

    Returns:
        None. Failures are reported on stdout and never raised, so one bad
        write does not abort the crawl.
    """
    try:
        collection = db[MONGO_TABLE]
        # find_one()/insert_one() replace Cursor.count() and
        # Collection.insert(), both of which were removed in PyMongo 4.
        if collection.find_one(result) is not None:
            print('comment duplicate')
            return
        collection.insert_one(result)
    except Exception as exc:
        # Deliberately best-effort, but surface the cause instead of hiding it.
        print('mongodb error', exc)
def _normalize_time(raw_time):
    """Turn a relative timestamp scraped from the page into a date label.

    Text containing '昨日' maps to ``yesterday`` and text containing '前' maps
    to ``today``; anything else (including empty/None) is returned unchanged.
    """
    if not raw_time:
        return raw_time
    if '昨日' in raw_time:
        return yesterday
    if '前' in raw_time:
        return today
    return raw_time


def _parse_comment(item):
    """Build the MongoDB document for one comment node (a PyQuery selection)."""
    author = item.find('.csPpFeed_master')
    return {
        'time': _normalize_time(item.find('.csPpFeed_time').text()),
        'name': author.text(),
        'img_url': item.find('.csPpFeed_section > .section_hd > a > img').attr('data-lazy'),
        'user_url': author.attr('href'),
        'ft_love': item.find('.csPpFeed_ftLove > .ftNums').attr('data-paopao-agreecnt'),
        'ft_comment': item.find('.csPpFeed_ftComment > .ftNums').attr('data-paopao-commentcnt'),
        'comment': item.find('.csPpFeed_des > span').text()
    }


def get_comments(page_num):
    """Scrape comment pages starting at *page_num* and store them in MongoDB.

    For each page index up to and including CIRCLE, the current page source
    is parsed, child nodes ``5 + 20 * page_num`` through ``23 + 20 * page_num``
    of the comment container are saved via save_to_mongo(), and the
    "load more" link is clicked to fetch the next batch.

    Args:
        page_num: zero-based index of the first page to process.

    Returns:
        None. Stops once *page_num* exceeds CIRCLE or when the "load more"
        link does not become available within the wait timeout.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this module.
    from selenium.common.exceptions import TimeoutException

    feed = '#qitancommonarea > div:nth-child(2) > div > div > div > div:nth-child(2)'
    item_selector = feed + ' > div > div:nth-child({})'
    more_locator = (By.CSS_SELECTOR, feed + '> div > div.csPpFeed_list > div > div > a')

    # Iterative form of the original tail recursion: one pass per page.
    while page_num <= CIRCLE:
        print('page_num:', page_num)
        doc = pq(browser.page_source)
        # NOTE(review): the range stops before child 24 + 20 * page_num;
        # confirm that node is meant to be excluded.
        for i in range(5 + page_num * 20, 24 + page_num * 20):
            item = doc.find(item_selector.format(i))
            if not item:
                # PyQuery returns an empty selection (it does not raise) when
                # nothing matches; skip it rather than store an empty record.
                continue
            # Save to MongoDB; save_to_mongo() also handles de-duplication.
            save_to_mongo(result=_parse_comment(item))

        # Load more comments.
        try:
            more = wait.until(EC.element_to_be_clickable(more_locator))
            more.click()
            sleep()
            page_num += 1
            # The link text returns to '加载更多' once the new batch is in place.
            wait.until(EC.text_to_be_present_in_element(more_locator, '加载更多'))
        except TimeoutException:
            # No usable "load more" link within the timeout: treat it as the
            # end of the comment feed instead of crashing the crawl.
            print('no more comments to load')
            return
def start(url):
    """Open *url*, scroll to the bottom of the page and crawl its comments.

    Args:
        url: address of the video page whose comments are scraped.
    """
    print('url:', url)
    browser.get(url)
    # Scroll to the bottom of the page.
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep()
    # Wait for the comments to load: block until the 24th child of the
    # comment container is present in the DOM.
    last_initial_item = (
        '#qitancommonarea > div:nth-child(2) > div > div > '
        'div > div:nth-child(2) > div > div:nth-child({})'
    ).format(24)
    comments_loaded = EC.presence_of_element_located(
        (By.CSS_SELECTOR, last_initial_item)
    )
    wait.until(comments_loaded)
    get_comments(0)
def sleep(seconds=3):
    """Pause the crawler so the page has time to update.

    Args:
        seconds: how long to block, in seconds. Defaults to 3, the delay
            every existing ``sleep()`` call relies on.
    """
    time.sleep(seconds)
def stop_vedio():
    """Click the play control in the player's bottom bar once it is clickable.

    NOTE(review): presumably used to pause video playback while scraping;
    nothing in the visible code calls it — confirm before relying on it.
    """
    play_toggle_locator = (
        By.CSS_SELECTOR,
        '#flashbox > div.pw-video > div:nth-child(5) > div.bottom-public > a.bottom-public_play > i',
    )
    play_toggle = wait.until(EC.element_to_be_clickable(play_toggle_locator))
    play_toggle.click()
if __name__ == '__main__':
    # Original author's note: 400 loops, 50 "load more" clicks.
    # NOTE(review): the 50 matches CIRCLE; the meaning of "400 loops" is not
    # evident from this file.
    try:
        start(url='http://www.iqiyi.com/v_19rr6z41d8.html')
    finally:
        # Always release the browser process and the MongoDB connection,
        # even when the crawl fails part-way through.
        browser.quit()
        client.close()