Data Extraction: Baidu Tieba (Two Approaches)

Approach 1 (Selenium): requires manually completing Baidu's security verification after the browser window opens.

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

class Tieba(object):
    def __init__(self, name):
        self.url = "https://tieba.baidu.com/f?ie=utf-8&kw={}".format(name)

        self.driver = webdriver.Edge()
        self.driver.get(self.url)
        # Pause long enough to complete Baidu's security verification by hand
        time.sleep(20)

    def get_data(self):
        # Each matched div wraps one thread entry on the forum list page
        data = self.driver.find_elements(By.XPATH, '//*[@id="thread_list"]/li/div/div[2]')

        data_list = []
        for d in data:
            temp = {}
            temp["title"] = d.find_element_by_xpath('./div[1]/div[1]/a').text
            temp["link"] = d.find_element_by_xpath('./div[1]/div[1]/a').get_attribute('href')
            data_list.append(temp)

        return data_list

    def save(self, data):
        for d in data:
            print(d)

    def run(self):
        # Scrape page after page until there is no "next page" link to click
        while True:
            data = self.get_data()
            self.save(data)
            try:
                self.driver.find_element(By.XPATH, '//a[text()="下一页>"]').click()
                time.sleep(3)
            except Exception:
                break


if __name__ == '__main__':
    yongjie = Tieba('永杰无间')
    yongjie.run()
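
A note on the fixed time.sleep(20): if the manual verification takes longer (or finishes sooner), an explicit wait is more robust than a hard-coded pause. A minimal sketch using Selenium's built-in wait helpers, assuming the thread list keeps the id thread_list that the XPaths above already rely on:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Block for up to 120 s until the thread list renders, i.e. until the
# security verification has been completed in the browser window.
WebDriverWait(self.driver, 120).until(
    EC.presence_of_element_located((By.ID, "thread_list"))
)

This would take the place of the time.sleep(20) call in __init__.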

Approach 2 (requests + lxml): no way past Baidu's security verification has been found yet, so the code below is only an outline of the idea and does not run end to end.

import requests
from lxml import etree


class Tieba(object):
    def __init__(self, name):
        self.url = "https://tieba.baidu.com/f?ie=utf-8&kw={}".format(name)
        self.headers = {
            "User - Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
            # "Cookie":'''BAIDUID=B3F25C8AD374555F02B32B6B1585ABE8:FG=1; BAIDUID_BFESS=B3F25C8AD374555F02B32B6B1585ABE8:FG=1; __bid_n=18a565562389216dcdc3fc; newlogin=1; BIDUPSID=B3F25C8AD374555F02B32B6B1585ABE8; PSTM=1693913218; BAIDU_WISE_UID=wapp_1694218649694_520; BDUSS=VrU2pWN3hReVFUVUZ5TVNFZGhpdFlKUS1vOUxyamN5YzhjeUllYk12a2R2aWRsRVFBQUFBJCQAAAAAAAAAAAEAAAAmFllNc2phampmZ2sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB0xAGUdMQBlY1; BDUSS_BFESS=VrU2pWN3hReVFUVUZ5TVNFZGhpdFlKUS1vOUxyamN5YzhjeUllYk12a2R2aWRsRVFBQUFBJCQAAAAAAAAAAAEAAAAmFllNc2phampmZ2sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB0xAGUdMQBlY1; STOKEN=1f95811dfbf0c4780125ed0b972e76a3f09f66ed37fc75a3b16dcc7a7f3adbcc; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1694218650,1694518616; USER_JUMP=-1; BAIDU_SSP_lcr=https://cn.bing.com/; st_key_id=17; arialoadData=false; 1297684006_FRSVideoUploadTip=1; video_bubble1297684006=1; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1694518653; XFI=c3bd56e0-5160-11ee-bb6e-c3a18478afb8; BA_HECTOR=200000850ga48g2gah2l8g8i1ig0jbu1o; ZFY=d5IWnal8hW99A1O4L8xEiDpgqOkS9M8dn1Y5ccuArDw:C; ab_sr=1.0.1_ZjAyMzdhN2FkMGNlMjhmMGMyNDUzNTFjYjE4Yzk1YzYwMGZlMzA1YWNmNmViOGYwYzUyMDI0YTNhNDU3MzAwYmNlYWNlMDc5NjE0OTM5NTQ4ZTEyNjdhOGQyOTBmOGQ1MjAyMTFmYzMyMjE3M2MzYjY4MGE1MWVmZGM2ZmRmYmM1NWUzOTJlYzI1ZjQyMzk4ZjFhZDAzODY1OThkMjk4YmYyZjJjZWFlMmFmNjQ1MzQzZDcxMTM2NTAzY2Y3M2Yw; st_data=0406d932e54fadd5312f20944603be4ee511c5dabc07a1034092c8fe461a350048c7833fad22eee12a58c99cadd5e9c892a6c15fa1e6f1d80d4bbb9bd153bb68861719832a669a34f2210e845d85bb2b6624730c5cd2e7c3bebee0ea87a5c00ba4d17826f904175ec50a1476c619c26da4dbd93ea37252925316a83d67db37f25494d2cb3ca896131153ac812b2cc2bd; st_sign=5a067136; XFCS=1C82E02000E93A7C8F3E1C8A05917938D130DC573002FEF574E7173C4B3A5ED0; XFT=sbPziQA7t0aVVKyv1sbmKCH4Lm7YWwnAoL0xn9rPcRk=; RT="z=1&dm=baidu.com&si=384f3e00-9c50-4b75-99a1-655afab479de&ss=lmg8mqbi&sl=k&tt=hqb&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=6zg3&ul=1dze4"'''
            # "User - Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"
        }

    def get_data(self, url):
        response = requests.get(url, headers=self.headers)
        # Dump the raw response so the (possibly verification-blocked) HTML can be inspected
        with open("temp.html", "wb") as f:
            f.write(response.content)
        return response.content

    def parse_data(self, data):
        html = etree.HTML(data)
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')

        data_list = []
        for el in el_list:
            temp = {}
            temp["title"] = el.xpath("./text()")[0]
            temp["link"] = el.xpath("./@href")[0]
            data_list.append(temp)

        # The pager's "next page" href is protocol-relative (//tieba.baidu.com/...),
        # so prepend the scheme; return None when there is no next page
        next_href = html.xpath('//*[@id="frs_list_pager"]/a[10]/@href')
        next_url = "https:" + next_href[0] if next_href else None
        return data_list, next_url

    def save(self, data_list):
        for data in data_list:
            print(data)

    def run(self):
        # Start from the first list page, then keep following the parsed next_url
        next_url = self.url
        while True:
            data = self.get_data(next_url)
            data_list, next_url = self.parse_data(data)
            self.save(data_list)
            if next_url is None:
                break


if __name__ == '__main__':
    tiba = Tieba("qq")
    tiba.run()
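
One possible direction for unblocking this version (untested, and hypothetical in its details): complete the security verification once in the Selenium-driven browser from approach 1, then copy that session's cookies into a requests.Session so subsequent plain-HTTP requests carry the verified state:

import requests
from selenium import webdriver

driver = webdriver.Edge()
driver.get("https://tieba.baidu.com/f?ie=utf-8&kw=qq")
input("Finish the security verification in the browser, then press Enter... ")

session = requests.Session()
# Transfer every cookie from the verified browser session to requests
for cookie in driver.get_cookies():
    session.cookies.set(cookie["name"], cookie["value"])

# Reuse the browser's exact User-Agent so the cookies match the client they were issued to
headers = {"User-Agent": driver.execute_script("return navigator.userAgent")}
response = session.get("https://tieba.baidu.com/f?ie=utf-8&kw=qq", headers=headers)
print(response.status_code)
driver.quit()

Whether Baidu accepts this depends on which other request properties its verification ties the cookies to, so it remains a sketch rather than a confirmed fix.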
