爬取任意百度贴吧评论(可直接Copy)

代码拿去就可以用,输入你要爬的贴吧名就行了。

看贴吧有多少页,range范围改为多少

爬取的评论在D:/res/info.txt中,注意控制访问频率,不要把time.sleep()删了,要不然ip会被临时封了,也可能被验证码疯狂骚扰,如果你有ip代理池那随意了

import requests
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import time
import re
# Static request headers: a desktop Chrome/Edge UA so Tieba serves the
# normal desktop HTML that the regexes in fun() expect.
header={
'user-agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76'
}

# Optional: load a rotating set of headers from disk instead.
#with open('D:/res/headers.txt','r',encoding='utf-8')as f:
#    headers=f.readlines()

# Optional: load a proxy pool from disk and pick one at random per request.
#proxies=[]
#with open('D:/res/https.txt','r',encoding='utf-8') as f:
#    proxies=f.readlines()
# Single hard-coded HTTP proxy used for every request; swap in your own
# (or enable the pool loader above) if this one stops working.
proxy={'http':'http://106.11.226.232:8009','https':'http://106.11.226.232:8009'}
def fun(url, page):
    """Scrape one listing page of a Tieba forum.

    Extracts every thread link on the listing page, fetches each thread,
    pulls the post bodies out of the HTML and appends them (space-joined)
    to D:/res/info.txt.

    Parameters:
        url:  listing-page URL (one page of the forum's thread index).
        page: page index; printed for progress tracking only.
    """
    # Throttle aggressively -- Baidu temp-bans IPs / serves captchas to
    # clients that request too fast.
    time.sleep(10)
    #proxy={'https':'http://'+np.random.choice(proxies),'http':'http://'+np.random.choice(proxies)}
    # NOTE(review): verify=False disables TLS certificate checks -- confirm
    # this is intentional for the proxy in use.
    r = requests.get(url, headers=header, timeout=10, proxies=proxy,
                     verify=False, allow_redirects=False)
    if r.status_code != 200:
        print('error')
        return  # bugfix: the original kept parsing the error page

    # Capture just the thread path (e.g. "/p/1234567890") instead of the
    # whole <a> tag: the original sliced a fixed href[6:19] window, which
    # breaks on thread ids that are not exactly 10 digits.
    hrefs = re.findall(
        r'href="(.*?)" title=".*?" target="_blank" class="j_th_tit ">',
        r.text)
    print(hrefs)
    print(page)
    if not hrefs:
        return

    # Create the output directory once, not once per thread.
    os.makedirs('D:/res', exist_ok=True)

    for path in hrefs:
        time.sleep(5)  # per-thread throttle
        thread_url = 'https://tieba.baidu.com' + path

        r = requests.get(thread_url, headers=header, timeout=10,
                         proxies=proxy, verify=False, allow_redirects=False)
        if r.status_code != 200:
            continue  # skip failed threads instead of parsing garbage
        r.encoding = 'utf-8'  # force correct decode of the UTF-8 page
        print(page)

        # Capture group grabs the post text directly -- no prefix slicing.
        texts = re.findall(
            r'class="d_post_content j_d_post_content  clearfix" '
            r'style="display:;">            (.*?)<',
            r.text)
        print(texts)

        with open('D:/res/info.txt', 'a', encoding='utf-8') as f:
            f.write(' '.join(texts))

# A ThreadPoolExecutor could fan pages out concurrently, but the
# sequential version is gentler on Tieba's rate limiting.

# Forum name to scrape, read from stdin.
topic = input()

# Walk the forum's thread index: each listing page holds 50 threads, so
# the pn offset advances in steps of 50. Shrink the range to match how
# many pages the forum actually has.
for page_no in range(1000):
    listing = f'https://tieba.baidu.com/f?kw={topic}&ie=utf-8&pn={page_no*50}'
    fun(listing, page_no)

你可能感兴趣的:(Python,python,beautifulsoup)