# Scraper for per-job salary data on kanzhun.com: walks a company's salary list
# pages, follows each job's detail pages, and stacks the results into a DataFrame.
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
class Job_info:
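    """Record of the fields scraped for one job title (some fields are reserved
    and never filled by the current scraper)."""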
def __init__(self):
self.company = ''
self.job_name = ''
self.job_money = ''
self.job_year = ''
self.job_city = ''
self.job_people_number = ''
self.url = ''
class Info_python:
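    """Scraper for one company's salary pages on kanzhun.com."""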
def __init__(self, company, company_url, job_url):
self.url = r'https://www.kanzhun.com'
self.company_url = company_url
self.job_url = job_url
self.company = company
self.headers = {'Accept': 'text/html, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36'}
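        # NOTE: these cookie values are expired samples captured from one session;
        # replace them with fresh values from a logged-in browser before running.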
self.cookies = {
'Cookie': 'aliyungf_tc=AQAAAD2MPjQrcwQAp4x2Dgdwc71am5e9; __c=1491732911; W_CITY_S_V=57; __g=-; isHasPushRecommentMessage=true; thirtyMinutes=true; isShowDownload=false; thirtyMinutesCount=2; pageType=2; ac="[email protected]"; __t=ZPp3Vr6QMt1cLNx; __l=r=&l=%2Fgsr194222.html%3Fka%3Dpercent-review-list; __a=29429174.1491732911..1491732911.7.1.7.7; t=ZPp3Vr6QMt1cLNx; AB_T=abvb'}
def get_company_url(self):
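        """Search kanzhun.com for the company name and print the candidate result
        blocks; a manual helper for locating the company's salary-page URL."""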
current_url = r'https://www.kanzhun.com/search?pageNum=1&query=' + self.company + '&type=1'
print(current_url)
response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
html = response.text
soup = BeautifulSoup(html, 'lxml')
print(html)
name = soup.find_all('div', class_='middle')
print(name)
    def loop_diff_page(self, page_num, url_str):
        """Swap the page number in a list URL of the form .../pN.html."""
        return re.sub(r"/p\d+", "/p" + str(page_num), url_str)

    def loop_diff_page_1(self, page_num, url_str):
        """Insert or swap the page number in a detail URL, which initially has no /pN segment."""
        if page_num == 1:
            return re.sub(r"\.html", "/p" + str(page_num) + ".html", url_str)
        return re.sub(r"/p\d+\.html", "/p" + str(page_num) + ".html", url_str)
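    # Example (hypothetical ID): loop_diff_page(3, 'https://www.kanzhun.com/firm/wage/abc~/p1.html')
    # returns 'https://www.kanzhun.com/firm/wage/abc~/p3.html'.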
    def get_all_pages_salary(self):
        """Walk every salary list page for the company and stack the per-job tables."""
        page_num = 1
        current_url = self.loop_diff_page(page_num, self.company_url)
        response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
        soup = BeautifulSoup(response.text, 'html.parser')
        all_pages_df = pd.DataFrame()
        # Paging past the last page yields an empty-state placeholder; use it as the stop signal.
        a = soup.find_all('div', class_='kz-empty-description')
        while len(a) == 0:
            one_page_df = self.get_one_page_salary(response)
            if all_pages_df.shape[0] == 0:
                all_pages_df = one_page_df
            else:
                all_pages_df = pd.concat([all_pages_df, one_page_df], axis=0)
            page_num += 1
            current_url = self.loop_diff_page(page_num, self.company_url)
            response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
            soup = BeautifulSoup(response.text, 'html.parser')
            a = soup.find_all('div', class_='kz-empty-description')
        # '公司全称' = full company name.
        all_pages_df.insert(loc=0, column='公司全称', value=self.company)
        return all_pages_df
    def check_status(self, job):
        """Return {'code': 1} if the job title is in the hardware-related whitelist,
        {'code': -1} otherwise."""
        hardware_jobs = {'硬件产品经理', '硬件测试工程师', '硬件测试', '硬件工程师', '硬件产品',
                         '测试工程师', '嵌入式', '嵌入式软件工程师', '嵌入式工程师', '工业设计',
                         '工业设计工程师', '机械结构', '机械结构工程师', '质量测试', '质量管理',
                         '质量管理/测试', '质量管理工程师'}
        return {'code': 1 if job in hardware_jobs else -1}
    def get_one_page_salary(self, response):
        """Parse one salary list page and collect detail tables for whitelisted jobs."""
        soup = BeautifulSoup(response.text, 'html.parser')
        job_list = soup.find_all('a', class_='salary-item')
        one_page_df = pd.DataFrame()
        for job1 in job_list:
            job = Job_info()
            try:
                job.job_name = job1.find('div', class_='name').text.strip()
                extra = job1.find('div', class_='extra').text.strip()
                # The 'extra' text contains the sample count; keep its first number.
                job.job_people_number = re.findall(r'\d+', extra)[0]
                job.job_money = job1.find('div', class_='number').text.strip()
                job.url = job1['href']
                detail_url = self.url + job.url
                print(detail_url)
                dic = self.check_status(job.job_name)
                if dic['code'] != 1:
                    continue  # skip jobs outside the hardware whitelist
                one_position_df = self.get_all_pages_detail(detail_url)
                one_position_df['job_name'] = job.job_name
                one_position_df['月均值'] = job.job_money            # monthly average salary
                one_position_df['贡献人数'] = job.job_people_number  # number of contributors
                if one_page_df.shape[0] == 0:
                    one_page_df = one_position_df
                else:
                    one_page_df = pd.concat([one_page_df, one_position_df], axis=0)
                print(one_position_df)
            except Exception as e:
                print(str(e))
        return one_page_df
def get_all_pages_detail(self, url):
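        """Walk every page of one job's salary-detail list and stack the per-person rows."""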
page_num = 1
current_url = self.loop_diff_page_1(page_num, url)
response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
        page_box = soup.find_all('div', class_='page-box')
        if page_box:
            a_list = page_box[0].find_all('li')
            # Drop the page-size selector; the second-to-last remaining <li> holds the last page number.
            a_list_proce = [a for a in a_list if a.get('class') != ['rc-pagination-options']]
            page_limit = int(a_list_proce[-2].text)
        else:
            page_limit = 1  # no pagination widget means a single page
        print(page_limit)
all_pages_person_df = pd.DataFrame()
while page_num <= page_limit:
print(page_num)
one_page_person_df = self.get_detail_salary(response)
if all_pages_person_df.shape[0] == 0:
all_pages_person_df = one_page_person_df
else:
all_pages_person_df = pd.concat([all_pages_person_df, one_page_person_df], axis=0)
page_num += 1
current_url = self.loop_diff_page_1(page_num, current_url)
response = requests.get(current_url, headers=self.headers, cookies=self.cookies)
return all_pages_person_df
def get_detail_salary(self, response):
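        """Parse one detail-list page: follow each per-person salary link and stack the rows."""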
html = response.text
soup = BeautifulSoup(html, 'html.parser')
job_list = soup.find_all('a', class_='salary-item second')
one_page_person_df = pd.DataFrame()
for job1 in job_list:
url_2nd = self.url + job1['href']
one_person_df = self.get_detail_salary_2nd(url_2nd)
if one_page_person_df.shape[0] == 0:
one_page_person_df = one_person_df
else:
one_page_person_df = pd.concat([one_page_person_df, one_person_df], axis=0)
return one_page_person_df
    def get_detail_salary_2nd(self, url):
        """Scrape one person's salary card into a single-row DataFrame of title -> content pairs."""
        response = requests.get(url, headers=self.headers, cookies=self.cookies)
        soup = BeautifulSoup(response.text, 'html.parser')
        info_list = soup.find_all('div', class_='card-container')
        detail_info = {}
        for infos in info_list:
            # Iterate only tag children: bare strings between tags have no .find(..., class_=...).
            for info in infos.find_all(True, recursive=False):
                a = info.find('span', class_='title')
                if a is None:
                    continue
                title = a.text.strip()
                content = info.find('span', class_='content').text.strip()
                detail_info[title] = content
        one_person_df = pd.DataFrame([detail_info])
        return one_person_df
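
# A minimal throttled-GET helper (a sketch; not wired into the class above). Swapping
# it in for the bare requests.get(...) calls adds a pause and a basic retry, so rapid
# paging is less likely to trip the site's anti-scraping checks. The delay/retries
# values are illustrative assumptions.
def polite_get(url, headers=None, cookies=None, delay=1.0, retries=2):
    import time  # local import keeps the sketch self-contained
    for attempt in range(retries + 1):
        try:
            resp = requests.get(url, headers=headers, cookies=cookies, timeout=10)
            time.sleep(delay)  # pause between page fetches
            return resp
        except requests.RequestException:
            if attempt == retries:
                raise
            time.sleep(delay * (attempt + 1))  # back off before retrying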
if __name__ == '__main__':
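    # Seed URLs for each company's salary list page; the "job" entries are unused placeholders.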
company_url_dict = {
"百度": {
"company": "https://www.kanzhun.com/firm/wage/1nV_2tk~/p1.html",
"job": ""
},
"作业帮": {
"company": "https://www.kanzhun.com/firm/wage/0nJ73961GQ~~/p1.html",
"job": ""
},
"网易有道": {
"company": "https://www.kanzhun.com/firm/wage/1Xd60tg~/p1.html",
"job": ""
}
}
    for key in company_url_dict:
        info_python = Info_python(key, company_url_dict[key]['company'], company_url_dict[key]['job'])
        df = info_python.get_all_pages_salary()
        # utf-8-sig keeps the Chinese headers readable when the CSV is opened in Excel.
        df.to_csv('D:/chatgpt/' + key + '1.csv', index=False, encoding='utf-8-sig')