python:爬取某个微博指定时间段的数据

成果:爬下2017年4月份到2018年8月份的个人原创微博数据(微博内容,发微博时间,设备及该条微博网址)


image.png
# Flow: start_chrome --> wait for Enter (input) --> open the date-filtered page --> scroll_down --> find_cards_info --> save --> find_next --> repeat while a next page exists

import csv
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def start_chrome():
    """Create a Chrome WebDriver backed by ``./chromedriver`` and return it."""
    browser = webdriver.Chrome(executable_path='./chromedriver')
    # Invoke the driver's start_client hook before handing the driver back.
    browser.start_client()
    return browser

def q(st, et):
    """Build the query-string suffix that filters a page by time range.

    Args:
        st: start of the range, inserted as the ``start_time`` value.
        et: end of the range, inserted as the ``end_time`` value.

    Returns:
        The suffix, beginning with ``?`` and ending with the ``#_0`` fragment.
    """
    params = (
        'is_ori=1',
        'key_word=',
        'start_time={}'.format(st),
        'end_time={}'.format(et),
        'is_search=1',
        'is_searchadv=1',
    )
    return '?' + '&'.join(params) + '#_0'

def scroll_down(times=15, pause=0.6):
    """Press END on the current page repeatedly to scroll it to the bottom.

    Uses the module-level ``driver``. The repeated presses are presumably
    there so that lazily loaded posts get a chance to appear -- confirm
    against the target page.

    Args:
        times: how many END key presses to send (default 15).
        pause: seconds to sleep after each press (default 0.6).
    """
    # find_element(By..., ...) replaces find_element_by_tag_name, which was
    # removed in Selenium 4.3; this form is also accepted by Selenium 3.
    html_page = driver.find_element(By.TAG_NAME, 'html')
    for i in range(times):
        print(i)
        # Simulate the END key so the page jumps to its bottom.
        html_page.send_keys(Keys.END)
        time.sleep(pause)

def find_cards_info():
    """Collect text, metadata line and link for every post on the page.

    Uses the module-level ``driver``.

    Returns:
        A list with one ``[content, posted, link]`` entry per post block:
        ``content`` is the text of the post body element, ``posted`` is the
        text of the "from" line (presumably publish time and device --
        confirm against the page), and ``link`` is the ``href`` of the first
        anchor inside that line.

    Raises:
        Whatever Selenium raises when a post block lacks one of the expected
        child elements; no card is skipped silently.
    """
    # CSS selectors: one post block, then the parts read from each block.
    cards_sel = 'div.WB_feed_detail'
    content_sel = 'div.WB_text.W_f14'
    time_sel = 'div.WB_from.S_txt2'
    link_sel = 'div.WB_from.S_txt2>a:nth-child(1)'

    # find_element(s)(By.CSS_SELECTOR, ...) replaces the *_by_css_selector
    # helpers removed in Selenium 4.3; it also works on Selenium 3.
    cards = driver.find_elements(By.CSS_SELECTOR, cards_sel)
    info_list = []
    for card in cards:
        content = card.find_element(By.CSS_SELECTOR, content_sel).text
        # Named ``posted`` rather than ``time`` so the imported ``time``
        # module is not shadowed inside this function.
        posted = card.find_element(By.CSS_SELECTOR, time_sel).text
        link = card.find_element(By.CSS_SELECTOR, link_sel).get_attribute('href')
        info_list.append([content, posted, link])
    return info_list

def find_next():
    """Return the URL of the next result page, or ``None`` if there is none.

    Uses the module-level ``driver`` and looks for ``a.page.next`` anchors;
    the ``href`` of the first match is returned.
    """
    # find_elements(By.CSS_SELECTOR, ...) replaces the helper removed in
    # Selenium 4.3. The plural form returns an empty list instead of raising
    # when no "next" link exists.
    next_links = driver.find_elements(By.CSS_SELECTOR, 'a.page.next')
    if next_links:
        return next_links[0].get_attribute('href')
    return None

def save(info_list, name):
    """Write rows to ``./<name>.csv``, appending when the file already exists.

    Args:
        info_list: iterable of rows handed to ``csv.writer.writerows``.
        name: file name without the ``.csv`` extension, relative to the
            current working directory.
    """
    target = './' + name + '.csv'
    # Existing file: append to it. Missing file: create it. The printed
    # message differs between the two cases.
    if os.path.exists(target):
        mode, message = 'a', 'Done!'
    else:
        mode, message = 'w+', 'Done!!'
    with open(target, mode, newline='', encoding='utf-8') as out:
        csv.writer(out).writerows(info_list)
        print(message)

def run_crawler(base, duration):
    """Crawl ``base`` and every following result page, saving each page's posts.

    For each page: load it, wait, scroll to the bottom, wait again, extract
    the post data with ``find_cards_info`` and write it with ``save`` to the
    CSV named after ``duration``. Continues until ``find_next`` reports no
    further page.

    Pages are visited in a loop rather than by recursion, so a long timeline
    cannot exhaust Python's recursion limit or keep one stack frame per page.

    Args:
        base: URL of the first page. A URL that does not end with
            ``'feedtop'`` gets the time-range query from ``q`` appended; one
            that does is loaded as-is (presumably pagination links already
            carry the filter -- confirm against the site).
        duration: time range formatted as ``'<start>~<end>'``; also used as
            the output file name.

    Raises:
        ValueError: if a query has to be built and ``duration`` does not
            split into exactly two parts on ``'~'``.
    """
    page_url = base
    while True:
        if not page_url.endswith('feedtop'):
            # First part is the start time, second part is the end time.
            st, et = duration.split('~')
            driver.get(page_url + q(st, et))
        else:
            driver.get(page_url)
        time.sleep(5)
        scroll_down()
        time.sleep(5)
        info_list = find_cards_info()
        save(info_list, duration)
        page_url = find_next()
        if not page_url:
            break

# Personal Weibo home page to crawl.
base   = 'https://weibo.com/p/1005055793083716'
driver = start_chrome()
try:
    # Block until Enter is pressed in the console (presumably so the user can
    # log in to Weibo in the opened browser first -- confirm).
    input()
    # The time range can be replaced; the format is '<start>~<end>'.
    run_crawler(base,'2017-04-20~2018-08-10')
finally:
    # Always shut the browser down, even if the crawl fails or is
    # interrupted, so no Chrome/chromedriver process is left behind.
    driver.quit()

你可能感兴趣的:(python:爬取某个微博指定时间段的数据)