The URL below points to the Weibo account of the Communist Youth League, which a friend asked me to scrape.
The script crawls, for each of the account's Weibo posts, the first 20 top-level comments under that post,
and saves everything into a single folder, with one file per post holding that post's first 20 comments.
(The original goal was 200 posts, but in the end it came out to only 86…)
One thing to note: you have to finish logging in to Weibo within 30 seconds of the page opening.
That's because the script needs to crawl a lot of posts later on, and those are only accessible after logging in.
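By the way, if you don't want to log in by hand on every run, one alternative is to save the session cookies after a first manual login and load them back next time. This is just a sketch of the idea, not part of the script below; it uses Selenium's standard get_cookies()/add_cookie() calls, and the cookies.json file name is something I made up (some cookie fields may need filtering depending on the Selenium version):

import json
import os
from selenium import webdriver

COOKIE_FILE = 'cookies.json'  # hypothetical file name, just for this sketch

browser = webdriver.Chrome()
browser.get('https://weibo.com')  # must visit the domain before adding cookies

if os.path.exists(COOKIE_FILE):
    # Reuse cookies saved from a previous manual login.
    with open(COOKIE_FILE) as f:
        for cookie in json.load(f):
            browser.add_cookie(cookie)
    browser.refresh()
else:
    # First run: log in manually, then save the session cookies.
    input('Log in in the browser window, then press Enter...')
    with open(COOKIE_FILE, 'w') as f:
        json.dump(browser.get_cookies(), f)

With that aside, here is the full script: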
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import time
import xlwt
import os

# One output folder; each post's comments go into their own .xls file.
if not os.path.exists('./Data'):
    os.mkdir('./Data')

browser = webdriver.Chrome()
url = 'https://weibo.com/u/3937348351?topnav=1&wvr=6&topsug=1'
browser.get(url)

# 30 seconds to log in manually before the scraping starts.
print('Waiting')
time.sleep(30)
print('Finish Waiting')

# Locate the profile feed and collect the post containers on the first page.
Pl_Official_MyProfileFeed__25 = browser.find_element(By.ID, 'Pl_Official_MyProfileFeed__25')
WB_feed_WB_feed_v3_WB_feed_v4 = Pl_Official_MyProfileFeed__25.find_element(By.CSS_SELECTOR, 'div[module-type="feed"]')
divs = WB_feed_WB_feed_v3_WB_feed_v4.find_elements(By.CSS_SELECTOR, 'div[tbinfo="ouid=3937348351"]')

TIMES = 0       # number of posts finished so far
FLESH_TIME = 0  # index of the next post to load on the current page

# Extending divs inside the loop lets the iteration pick up newly loaded posts.
for div in divs:
    # Expand the comment area under this post.
    fl_comment = div.find_element(By.CSS_SELECTOR, 'a[action-type="fl_comment"]')
    ActionChains(browser).click(fl_comment).perform()
    time.sleep(2)

    # The last link in the inline comment list opens the full comment page in a new tab.
    need_approval_comment = div.find_element(By.CSS_SELECTOR, 'div[node-type="need_approval_comment"]')
    list_ul = need_approval_comment.find_element(By.CSS_SELECTOR, 'div[class="list_ul"]')
    _blank = list_ul.find_elements(By.CSS_SELECTOR, 'a[target="_blank"]')[-1]
    ActionChains(browser).click(_blank).perform()
    browser.switch_to.window(browser.window_handles[1])
    time.sleep(5)

    # Grab the comment texts on the new tab.
    list_ul = browser.find_element(By.CSS_SELECTOR, 'div[class="list_ul"]')
    comments = list_ul.find_elements(By.CSS_SELECTOR, 'div[class="WB_text"]')
    count = 0
    work_book = xlwt.Workbook(encoding='utf-8')  # first argument of Workbook() is the encoding, not a path
    sheet = work_book.add_sheet('sheet 1')
    for comment in comments:
        string = comment.text
        if '回复' in string:  # skip replies; keep only top-level comments
            continue
        # print(string)
        sheet.write(count, 0, string)
        count += 1
        if count == 20:  # first 20 top-level comments per post
            break
    work_book.save('./Data/%d.xls' % TIMES)
    time.sleep(3)

    # Close the comment tab and return to the profile page.
    browser.close()
    browser.switch_to.window(browser.window_handles[0])
    print(' Finish', TIMES)
    TIMES += 1
    FLESH_TIME += 1
    if TIMES == 200:
        break

    if TIMES > 12:
        # The feed lazy-loads: re-locate it and append the next post container.
        Pl_Official_MyProfileFeed__25 = browser.find_element(By.ID, 'Pl_Official_MyProfileFeed__25')
        WB_feed_WB_feed_v3_WB_feed_v4 = Pl_Official_MyProfileFeed__25.find_element(By.CSS_SELECTOR, 'div[module-type="feed"]')
        divs_Change = WB_feed_WB_feed_v3_WB_feed_v4.find_elements(By.CSS_SELECTOR, 'div[tbinfo="ouid=3937348351"]')
        divs.extend(divs_Change[FLESH_TIME:FLESH_TIME + 1])

    if TIMES == len(divs):
        # Current page exhausted: click "next page" and reset the index.
        W_pages = browser.find_elements(By.CSS_SELECTOR, 'div[class="W_pages"]')[-1]
        next_page = W_pages.find_elements(By.TAG_NAME, 'a')[-1]
        # next_page = W_pages.find_element(By.CSS_SELECTOR, 'a[suda-uatrack="key=tblog_profile_v6&value=weibo_page"]')
        ActionChains(browser).click(next_page).perform()
        time.sleep(5)
        FLESH_TIME = 0
        Pl_Official_MyProfileFeed__25 = browser.find_element(By.ID, 'Pl_Official_MyProfileFeed__25')
        WB_feed_WB_feed_v3_WB_feed_v4 = Pl_Official_MyProfileFeed__25.find_element(By.CSS_SELECTOR, 'div[module-type="feed"]')
        divs_Change = WB_feed_WB_feed_v3_WB_feed_v4.find_elements(By.CSS_SELECTOR, 'div[tbinfo="ouid=3937348351"]')
        divs.extend(divs_Change[FLESH_TIME:FLESH_TIME + 1])

browser.close()
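One last note: the fixed time.sleep() calls above are easy to break on a slow connection. A sturdier pattern would be Selenium's explicit waits; here is a minimal sketch of how the comment-list lookup could be rewritten with WebDriverWait (same selectors as above, and browser is the driver from the script):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the comment list to appear instead of sleeping blindly.
wait = WebDriverWait(browser, 10)
list_ul = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class="list_ul"]'))
)
comments = list_ul.find_elements(By.CSS_SELECTOR, 'div[class="WB_text"]')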