Python + Selenium 抓取“楚乔传”评论

1. config (saved as CommentSpider/Config.py, imported by the spider below)

# MongoDB connection settings (host, database, and target collection).
MONGO_URL = 'localhost'
MONGO_DB = 'iqiyi'
MONGO_TABLE = 'iqy_comments'


# PhantomJS service arguments: skip image downloads and enable disk cache.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Maximum number of "load more" pages the crawler will fetch.
CIRCLE = 50

2. spider (the crawler itself; reads the constants from the config above)

import pymongo
import time
import datetime

from pyquery import PyQuery as pq

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from CommentSpider.Config import *

# --- MongoDB connection, shared by save_to_mongo ---
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# --- Selenium driver ---
# Alternative headless-Chrome setup, kept for reference:
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# browser = webdriver.Chrome(chrome_options=chrome_options,executable_path='D:\\ProgramData\\Anaconda3\\Scripts\\chromedriver')

# browser = webdriver.Chrome()

# NOTE(review): PhantomJS support was removed from Selenium 3.8+ —
# confirm the pinned Selenium version still exposes webdriver.PhantomJS.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
browser.set_window_size(1400, 900)

# Explicit wait with a 15-second timeout, reused by all page queries below.
wait = WebDriverWait(browser, 15)

# --- Date strings used to normalize relative comment timestamps ---
now = datetime.date.today()
one_day = datetime.timedelta(days=1)
today = now.strftime('%m-%d')                  # e.g. '07-15'
yesterday = (now - one_day).strftime('%m-%d')  # day before, same format

def save_to_mongo(result):
    """Insert *result* into MONGO_TABLE unless an identical document exists.

    Deduplicates by querying for a document matching every field of
    *result*.  Any MongoDB failure is reported and swallowed so a single
    bad write cannot abort the whole crawl.

    Args:
        result: dict of comment fields (see get_comments for the schema).
    """
    try:
        # count_documents(..., limit=1) replaces the deprecated
        # find(result).limit(1).count() pattern and stops at the first match.
        if db[MONGO_TABLE].count_documents(result, limit=1) == 0:
            # insert_one replaces the deprecated Collection.insert().
            db[MONGO_TABLE].insert_one(result)
        else:
            print('comment duplicate')
    except Exception as e:
        # Best-effort persistence: log the actual error and keep crawling.
        print('mongodb error:', e)

def get_comments(page_num):
    """Scrape comment items page by page, starting at *page_num*.

    Parses the current ``browser.page_source`` with pyquery, saves each
    comment to MongoDB (deduplicated), then clicks the "load more" button
    and repeats until CIRCLE pages have been processed.

    Rewritten iteratively: the original recursed once per page, which
    grows the call stack needlessly for long crawls.

    Args:
        page_num: zero-based index of the first page to process.
    """
    while page_num <= CIRCLE:
        print('page_num:', page_num)
        doc = pq(browser.page_source)
        # Comment items are nth-children of the feed container; each
        # "load more" click appends roughly 20 of them.
        for idx in range(5 + page_num * 20, 24 + page_num * 20):
            try:
                item = doc.find('#qitancommonarea > div:nth-child(2) > div > div > div > '
                                'div:nth-child(2) > div > div:nth-child({})'.format(idx))
            except Exception:
                # Bail out entirely if the lookup fails (original behavior).
                return

            c_time = item.find('.csPpFeed_time').text()
            # Normalize relative timestamps ("昨日…" / "…前") to MM-DD.
            if '昨日' in c_time:
                c_time = yesterday
            elif '前' in c_time:
                c_time = today

            comment = {
                'time': c_time,
                'name': item.find('.csPpFeed_master').text(),
                'img_url': item.find('.csPpFeed_section > .section_hd > a > img').attr('data-lazy'),
                'user_url': item.find('.csPpFeed_master').attr('href'),
                'ft_love': item.find('.csPpFeed_ftLove > .ftNums').attr('data-paopao-agreecnt'),
                'ft_comment': item.find('.csPpFeed_ftComment > .ftNums').attr('data-paopao-commentcnt'),
                'comment': item.find('.csPpFeed_des > span').text()
            }
            # Save to MongoDB with deduplication.
            save_to_mongo(result=comment)

        # Click "load more", then wait for the button text to reappear,
        # which signals the next batch has rendered.
        more = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#qitancommonarea > div:nth-child(2) > div > div > div > div:nth-child(2)'
                                  '> div > div.csPpFeed_list > div > div > a')
            )
        )
        more.click()
        sleep()
        page_num += 1
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, '#qitancommonarea > div:nth-child(2) > div > div > div > div:nth-child(2)'
                                  '> div > div.csPpFeed_list > div > div > a'), '加载更多'
            )
        )


def start(url):
    """Open *url*, scroll to the bottom so comments lazy-load, then crawl.

    Args:
        url: the iQiyi video page whose comment section will be scraped.
    """
    print('url:', url)
    browser.get(url=url)

    # Jump to the page bottom to trigger lazy loading of the comment area.
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep()

    # Block until the 24th child (end of the first batch) is present,
    # i.e. the comment list has actually rendered.
    first_batch_selector = ('#qitancommonarea > div:nth-child(2) > div > div > '
                            'div > div:nth-child(2) > div > div:nth-child({})'.format(24))
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, first_batch_selector)))

    get_comments(page_num=0)

def sleep(seconds=3):
    """Pause the crawler to throttle requests.

    Generalized from a hard-coded 3-second pause; the default keeps the
    original behavior for all existing callers.

    Args:
        seconds: delay in seconds (default 3).
    """
    time.sleep(seconds)

def stop_vedio():
    """Pause the video player by clicking its play/pause toggle.

    The misspelled public name ('vedio') is kept to preserve the interface.
    """
    toggle_selector = ('#flashbox > div.pw-video > div:nth-child(5) > '
                       'div.bottom-public > a.bottom-public_play > i')
    toggle = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, toggle_selector)))
    toggle.click()

if __name__ == '__main__':
    # Roughly 400 comment iterations in total:
    # CIRCLE (50) "load more" clicks at ~20 comments per page.
    start(url='http://www.iqiyi.com/v_19rr6z41d8.html')


你可能感兴趣的:(Python,数据采集,selenium,python)