Python web scraper for kanzhun.com salary data


import re

import pandas as pd
import requests
from bs4 import BeautifulSoup


class Job_info:  # holds one salary listing scraped from a company's wage page
    def __init__(self):
        self.company = ''
        self.job_name = ''
        self.job_money = ''
        self.job_year = ''
        self.job_city = ''
        self.job_people_number = ''
        self.url = ''


class Info_python:  # scraper for kanzhun.com company wage pages
    def __init__(self, company, company_url, job_url):
        self.url = r'https://www.kanzhun.com'
        self.company_url = company_url
        self.job_url = job_url
        self.company = company
        self.headers = {'Accept': 'text/html, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, sdch',
                        'Accept-Language': 'zh-CN,zh;q=0.8',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
        self.cookies = {  # stale sample cookie from the original post; swap in your own session cookie if needed
            'Cookie': 'aliyungf_tc=AQAAAD2MPjQrcwQAp4x2Dgdwc71am5e9; __c=1491732911; W_CITY_S_V=57; __g=-; isHasPushRecommentMessage=true; thirtyMinutes=true; isShowDownload=false; thirtyMinutesCount=2; pageType=2; ac="[email protected]"; __t=ZPp3Vr6QMt1cLNx; __l=r=&l=%2Fgsr194222.html%3Fka%3Dpercent-review-list; __a=29429174.1491732911..1491732911.7.1.7.7; t=ZPp3Vr6QMt1cLNx; AB_T=abvb'}

    def get_company_url(self):  # search kanzhun.com for the company and print candidate result blocks
        current_url = r'https://www.kanzhun.com/search?pageNum=1&query=' + self.company + '&type=1'
        print(current_url)
        response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        name = soup.find_all('div', class_='middle')
        print(name)

    def loop_diff_page(self, page_num, url_str):  # swap the /pN page segment in a listing URL
        return re.sub(r"/p\d+", "/p" + str(page_num), url_str)

    def loop_diff_page_1(self, page_num, url_str):  # insert or swap /pN before .html in a detail URL
        if page_num == 1:
            return re.sub(r"\.html", "/p" + str(page_num) + ".html", url_str)
        return re.sub(r"/p\d+\.html", "/p" + str(page_num) + ".html", url_str)
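
    # Quick sanity check of the two pagination helpers above (expected values
    # are my reading of the regexes, not output captured from the site;
    # 'xxx' is a placeholder path segment):
    #   loop_diff_page(3, 'https://www.kanzhun.com/firm/wage/1nV_2tk~/p1.html')
    #     -> 'https://www.kanzhun.com/firm/wage/1nV_2tk~/p3.html'
    #   loop_diff_page_1(1, 'https://www.kanzhun.com/xxx.html')
    #     -> 'https://www.kanzhun.com/xxx/p1.html'
    #   loop_diff_page_1(2, 'https://www.kanzhun.com/xxx/p1.html')
    #     -> 'https://www.kanzhun.com/xxx/p2.html'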

    def get_all_pages_salary(self):  # walk every job-listing page under the company's wage page
        page_num = 1
        current_url = self.loop_diff_page(page_num, self.company_url)
        response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        pagination = soup.find_all('ul', class_='rc-pagination')
        a_list = pagination[0].find_all('li')
        page_limit = int(a_list[-3].text)  # parsed but unused; the loop below stops on the empty-results notice instead
        all_pages_df = pd.DataFrame()
        # stop once the "没有人发布过这家公司的工资信息" (no salary info posted) placeholder appears
        a = soup.find_all('div', class_='kz-empty-description')
        while len(a) == 0:
            one_page_df = self.get_one_page_salary(response)
            if all_pages_df.shape[0] == 0:
                all_pages_df = one_page_df
            else:
                all_pages_df = pd.concat([all_pages_df, one_page_df], axis=0)
            page_num += 1
            current_url = self.loop_diff_page(page_num, self.company_url)
            response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            a = soup.find_all('div', class_='kz-empty-description')
        all_pages_df.insert(loc=0, column='公司全称', value=self.company)  # 公司全称 = company full name
        return all_pages_df

    def check_status(self, job):  # keep only hardware / embedded / QA roles: code 1 = keep, -1 = skip
        dic = {'code': 1}
        if job not in ['硬件产品经理', '硬件测试工程师', '硬件测试', '硬件工程师', '硬件产品', '测试工程师', '嵌入式', '嵌入式软件工程师', '嵌入式工程师', '工业设计', '工业设计工程师', '机械结构', '机械结构工程师', '质量测试', '质量管理', '质量管理/测试', '质量管理工程师']:
            dic['code'] = -1
        return dic
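
    # For example (my reading of the whitelist, not captured output):
    #   check_status('硬件工程师') -> {'code': 1}    # whitelisted role
    #   check_status('行政专员')   -> {'code': -1}   # hypothetical role outside the list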

    def get_one_page_salary(self, response):  # iterate over each position listed on one job page
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        job_list = soup.find_all('a', class_='salary-item')
        one_page_df = pd.DataFrame()
        for job1 in job_list:
            job = Job_info()
            try:
                job.job_name = job1.find('div', class_='name').text.strip()
                a = job1.find('div', class_='extra').text.strip()
                job.job_people_number = re.findall(r'\d+', a)[0]
                job.job_money = job1.find('div', class_='number').text.strip()
                job.url = job1['href']
                detail_url = self.url + job.url
                print(detail_url)
                dic = self.check_status(job.job_name)
                print(dic['code'])
                if dic['code'] == 1:
                    # position URL in hand: descend into it to collect each person's record
                    one_position_df = self.get_all_pages_detail(detail_url)
                    one_position_df['job_name'] = job.job_name
                    one_position_df['月均值'] = job.job_money  # monthly average salary
                    one_position_df['贡献人数'] = job.job_people_number  # number of contributors
                    if one_page_df.shape[0] == 0:
                        one_page_df = one_position_df
                    else:
                        one_page_df = pd.concat([one_page_df, one_position_df], axis=0)
                    print(one_position_df)
                else:
                    continue
            except Exception as e:
                print(str(e))
        return one_page_df

    def get_all_pages_detail(self, url):  # walk every page of one position's salary list
        page_num = 1
        current_url = self.loop_diff_page_1(page_num, url)
        response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        page_box = soup.find_all('div', class_='page-box')
        a_list = page_box[0].find_all('li')
        # drop the page-size selector <li> so a_list_proce[-2] holds the last page number
        a_list_proce = [a for a in a_list if a['class'] != ['rc-pagination-options']]
        page_limit = int(a_list_proce[-2].text)

        print(page_limit)
        all_pages_person_df = pd.DataFrame()
        while page_num <= page_limit:
            print(page_num)
            one_page_person_df = self.get_detail_salary(response)
            if all_pages_person_df.shape[0] == 0:
                all_pages_person_df = one_page_person_df
            else:
                all_pages_person_df = pd.concat([all_pages_person_df, one_page_person_df], axis=0)
            page_num += 1
            current_url = self.loop_diff_page_1(page_num, current_url)
            response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
        return all_pages_person_df

    def get_detail_salary(self, response):  # collect each individual salary record linked from one page
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        job_list = soup.find_all('a', class_='salary-item second')
        one_page_person_df = pd.DataFrame()
        for job1 in job_list:
            url_2nd = self.url + job1['href']
            one_person_df = self.get_detail_salary_2nd(url_2nd)  # follow the company-position-person link
            if one_page_person_df.shape[0] == 0:
                one_page_person_df = one_person_df
            else:
                one_page_person_df = pd.concat([one_page_person_df, one_person_df], axis=0)

        return one_page_person_df

    def get_detail_salary_2nd(self, url):  # fetch and parse one person's detailed salary record
        response = requests.get(url, headers=self.headers, cookies=self.cookies)
        html = response.text  # company-position-person detail page
        soup = BeautifulSoup(html, 'html.parser')
        info_list = soup.find_all('div', class_='card-container')

        detail_info = {}
        for infos in info_list:  # iterate over each card block
            for info in infos:  # iterate over each row inside the block
                if isinstance(info, str):  # skip bare text nodes, which lack bs4's find()
                    continue
                a = info.find('span', class_='title')
                if a is None:
                    continue
                title_list = a.text.strip()
                content_list = info.find('span', class_='content').text.strip()
                detail_info[title_list] = content_list
        one_person_df = pd.DataFrame([detail_info])
        return one_person_df


#  the position loop may legitimately come back empty
#  the per-person loop still needs a duplicate check (TODO)
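
# The TODO above asks for a duplicate check in the per-person loop. A minimal
# sketch (my addition, not part of the original scraper): rows that are fully
# identical across all scraped columns are treated as re-scraped duplicates.
def drop_duplicate_persons(df):
    # pandas keeps the first occurrence of each fully identical row
    return df.drop_duplicates().reset_index(drop=True)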


if __name__ == '__main__':
    company_url_dict = {
        # "科大讯飞": {
        #     "company": "https://www.kanzhun.com/firm/wage/0nZz3tk~/p1.html",
        #     "job": ""
        # },
        "百度": {
            "company": "https://www.kanzhun.com/firm/wage/1nV_2tk~/p1.html",
            "job": ""
        },
        "作业帮": {
            "company": "https://www.kanzhun.com/firm/wage/0nJ73961GQ~~/p1.html",
            "job": ""
        },
        "网易有道": {
            "company": "https://www.kanzhun.com/firm/wage/1Xd60tg~/p1.html",
            "job": ""
        }
    }

    # info_python = Info_python('江苏彤明', 'https://www.kanzhun.com/firm/wage/1nF729i5GQ~~/p1.html', '')
    # df = info_python.get_all_pages_salary()
    # print(df)
    # df.to_csv(r'D:\chatgpt/' + '江苏彤明.csv', encoding='utf-8')
    for key in company_url_dict:
        info_python = Info_python(key, company_url_dict[key]['company'], company_url_dict[key]['job'])
        df = info_python.get_all_pages_salary()
        df.to_csv(r'D:\chatgpt/' + key + '1.csv', index=False, encoding='utf-8')
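
    # Notes:
    # - To find a company's wage-page URL in the first place, an earlier draft
    #   in the original post ran the search helper, roughly:
    #       Info_python('宁德时代', '', '').get_company_url()
    # - My addition: the page loops above fire requests back-to-back; a short
    #   pause between fetches is gentler on the site, e.g.:
    #       import time
    #       time.sleep(1)  # before each requests.get inside the while loops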


