1. Scraping job listings with Python
A simple scraper for Zhilian Zhaopin (智联招聘) job listings.
```python
# !/usr/bin/env python
# -*-coding:utf-8-*-
"""
@Author  : xiaofeng
@Time    : 2018/12/18 16:31
@Desc    : Less interests, More interest. (scrape Zhilian Zhaopin job listings)
@Project : python_appliction
@FileName: zhilianzhaopin.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import pymysql as db
import requests

# MySQL connection settings
mysql_config = {
    'host': '101.0.2.110',
    'user': 'test',
    'password': 'test',
    'database': 'xiaofeng',
    'charset': 'utf8'
}

# search API endpoint
url = 'https://data.highpin.cn/api/JobSearch/Search'

"""
Fetch Zhilian Zhaopin job listings
@:param page      page number
@:param position  job keyword
"""


def zhilian(page, position):
    # request headers
    headers = {
        'Referer': 'https://www.highpin.cn/zhiwei/',
        'Origin': 'https://www.highpin.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    # form data
    datas = {
        'Q': position,
        'pageIndex': page
    }
    resp = requests.post(url, data=datas, headers=headers)
    result = resp.json()
    return result


"""
Print results to the console
"""


def print_data(result):
    body = result['body']['JobList']
    print(body)


"""
Write results to the database
"""


def insert(result):
    print("insert......")
    database = db.connect(**mysql_config)
    for item in result:
        print(item)
        sql = "INSERT INTO zhilian(JobID,JobTitle,ReferrerType,CompanyName,AnnualSalaryMin," \
              "AnnualSalaryMax,JobLactionStr,JobLactionID,JobTags,JobDegree,JobDegreeId," \
              "WorkExperience,WorkExperienceID,CompanyIndustry,CompanyIndustryID," \
              "CompanyType,CompanyTypeID,PublishDate,CompanyScale,SalaryWhite) " \
              "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # list fields must be converted to str before binding
        JobLactionID = str(item['JobLactionID'])
        CompanyIndustryID = str(item['CompanyIndustryID'])
        if 'JobTags' in item:
            JobTags = str(item['JobTags'])
        else:
            JobTags = ''
        cursor = database.cursor()
        cursor.execute(sql, (
            item['JobID'], item['JobTitle'], item['ReferrerType'], item['CompanyName'],
            item['AnnualSalaryMin'], item['AnnualSalaryMax'], item['JobLactionStr'],
            JobLactionID, JobTags, item['JobDegree'], item['JobDegreeId'],
            item['WorkExperience'], item['WorkExperienceID'], item['CompanyIndustry'],
            CompanyIndustryID, item['CompanyType'], item['CompanyTypeID'],
            item['PublishDate'], item['CompanyScale'], item['SalaryWhite']))
    database.commit()
    cursor.close()
    database.close()


def main(position):
    result = zhilian(1, position)
    page_count = result['body']['PageCount']
    print("--------------- total pages:", page_count, "---------------")
    page = 1
    while page <= page_count:
        print('---------------- page', page, '----------------')
        result = zhilian(page, position)
        # print_data(result)
        body = result['body']['JobList']
        insert(body)
        page = page + 1


if __name__ == '__main__':
    main('java')
```
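Note that `insert()` assumes a `zhilian` table already exists in the `xiaofeng` database; the post never shows the DDL. Below is a minimal sketch of a matching schema, created through the same pymysql connection. The column names come from the `INSERT` statement above, but the types (all `VARCHAR`, with lengths chosen loosely) are assumptions, not taken from the original post:

```python
import pymysql as db

# Assumed schema: column names mirror the INSERT statement; the VARCHAR
# types and lengths are guesses, adjust to the actual API payloads.
create_sql = """
CREATE TABLE IF NOT EXISTS zhilian (
    JobID VARCHAR(64),
    JobTitle VARCHAR(255),
    ReferrerType VARCHAR(64),
    CompanyName VARCHAR(255),
    AnnualSalaryMin VARCHAR(32),
    AnnualSalaryMax VARCHAR(32),
    JobLactionStr VARCHAR(255),
    JobLactionID VARCHAR(255),
    JobTags VARCHAR(512),
    JobDegree VARCHAR(64),
    JobDegreeId VARCHAR(32),
    WorkExperience VARCHAR(64),
    WorkExperienceID VARCHAR(32),
    CompanyIndustry VARCHAR(255),
    CompanyIndustryID VARCHAR(255),
    CompanyType VARCHAR(64),
    CompanyTypeID VARCHAR(32),
    PublishDate VARCHAR(32),
    CompanyScale VARCHAR(64),
    SalaryWhite VARCHAR(32)
) DEFAULT CHARSET=utf8
"""

# reuses the mysql_config dict defined in the script above
database = db.connect(**mysql_config)
cursor = database.cursor()
cursor.execute(create_sql)
database.commit()
cursor.close()
database.close()
```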
Console output:
Data written to the database:
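One tweak worth considering that is not in the original script: pausing between page requests so the crawler doesn't hammer the API. A minimal sketch reusing the `zhilian()` and `insert()` functions above; the one-second default delay is an arbitrary assumption:

```python
import time


def main_throttled(position, delay=1.0):
    """Variant of main() that sleeps between page requests."""
    page_count = zhilian(1, position)['body']['PageCount']
    for page in range(1, page_count + 1):
        body = zhilian(page, position)['body']['JobList']
        insert(body)
        time.sleep(delay)  # pause between requests; the delay value is an assumption
```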
2. Scraping CSDN blog posts with Python
A simple Python scraper for a CSDN blog's article list (for learning purposes only).
Steps:
1. Build the paginated article-list URLs
2. Parse the HTML and extract the target fields
```python
# !/usr/bin/env python
# -*-coding:utf-8-*-
"""
@Author  : xiaofeng
@Time    : 2018/12/20 11:30
@Desc    : Less interests, More interest. (scrape a CSDN blog's article list)
@Project : python_appliction
@FileName: csdn.py
@Software: PyCharm
@Blog    : https://blog.csdn.net/zwx19921215
"""
import requests
from lxml import html

# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}

"""
Parse the HTML and extract the relevant data
@:param url  page URL
"""


def parse_html(url):
    response = requests.get(url=url, headers=headers)
    text = html.fromstring(response.text)
    hrefs = text.xpath('//h4/a/@href')
    title = text.xpath('//h4/a/text()')
    """
    Summary, publish date, view count, comment count and other
    attributes can be extracted the same way as needed.
    """
    # drop the first link: CSDN list pages always carry one extra
    # default entry (unclear why), so there is one extra link as well
    hrefs.pop(0)
    titles = []
    # normalize the titles
    for item in title:
        st = str(item).replace('\n', '').strip()
        if st != '':
            titles.append(st)
    # drop the first title, the same extra default entry ("帝都的凛冬")
    titles.pop(0)
    # pair each title with its link and print
    i = 0
    for item in titles:
        results = {
            '标题': titles[i],
            '链接': hrefs[i]
        }
        i = i + 1
        print(results)


"""
Detect the number of pages automatically
@:param page_url  base page URL
@:param page      page number
"""


def get_page(page_url, page):
    url = page_url + str(page)
    print('url=', url)
    response = requests.get(url=url, headers=headers)
    text = html.fromstring(response.text)
    next_page = text.xpath(
        '//div[@class="ui-paging-container"]/ul/li[@class="js-page-next js-page-action ui-pager"]/text()')
    if next_page:
        parse_html(url)
        page = page + 1
        get_page(page_url, page)
    else:
        return -1


"""
Crawl the data page by page
@:param page_url  base page URL
@:param page      starting page number
"""


def get_page2(page_url, page):
    url = page_url + str(page)
    while page <= 10:
        print('\n')
        print("---------------------- page", page, "----------------------")
        print('url=', url)
        print('\n')
        parse_html(url)
        page = page + 1
        url = page_url + str(page)


if __name__ == '__main__':
    page_url = 'https://blog.csdn.net/zwx19921215/article/list/'
    get_page2(page_url, 1)
```
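For the "extract other attributes as needed" note inside `parse_html()`, here is one hedged sketch of how summary and publish date might be pulled alongside title and link. The container and class names (`article-item-box`, `content`, `date`) are assumptions about CSDN's 2018-era markup, which changes over time, so verify them against the live page source; the `headers` dict from the script above is reused:

```python
import requests
from lxml import html


def parse_extras(url):
    """Sketch: title, link, summary and date per article (class names assumed)."""
    response = requests.get(url=url, headers=headers)
    text = html.fromstring(response.text)
    # iterate per article box so each title stays paired with its own metadata
    for box in text.xpath('//div[contains(@class, "article-item-box")]'):
        title = ''.join(box.xpath('.//h4/a/text()')).replace('\n', '').strip()
        href = box.xpath('.//h4/a/@href')
        summary = ''.join(box.xpath('.//p[@class="content"]//text()')).strip()
        date = ''.join(box.xpath('.//span[@class="date"]/text()')).strip()
        print({'title': title, 'link': href[0] if href else '',
               'summary': summary, 'date': date})
```

Iterating per article box keeps each title matched to its own link and metadata, which also sidesteps the index misalignment that forces the two `pop(0)` calls in the original.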
Hope this helps.