最近忙着做毕业设计,最爱的python当然成了我的切入点。因此特意找了一个项目来练练手,项目很简单,就是利用python爬取数据,然后再利用matplotlib进行数据可视化。
项目设计:爬虫爬取数据并存入mongodb数据库中,然后再编写脚本读取数据,进行绘图。
一:数据爬取(以腾讯招聘为例)
这四个类别就是我们的爬取目标
然后看一看网页源代码,准备利用xpath提取想要的元素
可以看到有两个不同的class属性,为了将两个都提取出来,我们只需要使用xpath的运算符“ | ”,计算两个节点集的并集
(//tr[@class="even"] | //tr[@class="odd"])//a/text()
其他不懂的,可以参考我的另外两篇博文:
python爬虫攻略(6):lxml与xpath
http://blog.csdn.net/katyusha1/article/details/78377887
python3爬虫攻略(7):爬虫案例
http://blog.csdn.net/katyusha1/article/details/78380421
爬虫源代码如下
# -*- coding:utf-8 -*-
import requests
from lxml import etree
import time
import pymongo
import random
class Tencent(object):
    """Crawl the Tencent recruitment listing and store the rows in MongoDB.

    Pages are requested one after another by appending an increasing offset
    to ``self.url``; every scraped row is buffered in ``self.tencent_data``
    and written to MongoDB once the crawl loop has finished.
    """

    # Amount added to the ``start=`` offset to move to the next page.
    PAGE_STEP = 10
    # Offset of the last page that is requested.
    LAST_INDEX = 2500
    # Seconds to wait after each page request.
    REQUEST_DELAY = 5
    # Pool of request headers; one entry is picked at random per request.
    HEADERS_LIST = [
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0"},
    ]

    def __init__(self):
        """Initialise the base URL, the page offset, the loop switch and the buffer."""
        self.url = "http://hr.tencent.com/position.php?&start="
        self.index = 0
        self.switch = True
        # Holds one dict per scraped job posting until it is flushed to MongoDB.
        self.tencent_data = []

    def mongodb(self):
        """Write every buffered record to the ``tencent`` collection of database ``py3``.

        Nothing is sent to the server when the buffer is empty, because
        ``insert_many`` rejects an empty document list.
        """
        if self.tencent_data:
            client = pymongo.MongoClient(host="localhost", port=27017)
            try:
                # ``Collection.insert`` no longer exists in PyMongo 4; one bulk
                # insert also replaces a round trip per document.
                client.py3.tencent.insert_many(self.tencent_data)
            finally:
                client.close()
        print("已将数据全部存入到mongodb中!")

    def gethtml(self, url):
        """Download ``url`` and return the page parsed by ``lxml.etree.HTML``.

        The request is sent with a randomly chosen header set. If it fails
        with a ``requests`` exception it is retried exactly once; a failure
        of the retry propagates to the caller.
        """
        headers = random.choice(self.HEADERS_LIST)
        try:
            response = requests.get(url=url, headers=headers, timeout=20)
        except requests.RequestException:
            print("have a error")
            # Single retry; previously the page was re-fetched even on success.
            response = requests.get(url=url, headers=headers, timeout=20)
        return etree.HTML(response.text)

    def loadpage(self, url):
        """Extract the job rows of one listing page via XPath and buffer them.

        Each row becomes a dict with the keys ``job_title``, ``job_category``,
        ``number`` and ``location`` appended to ``self.tencent_data``.
        """
        content = self.gethtml(url)
        # Job title
        job_title = content.xpath('(//tr[@class="even"] | //tr[@class="odd"])//a/text()')
        # Job category
        job_category = content.xpath('//tr[@class="even"]//td[2]//text() | //tr[@class="odd"]//td[2]//text()')
        # Head count
        number = content.xpath('//tr[@class="even"]//td[3]//text() | //tr[@class="odd"]//td[3]//text()')
        # Location
        location = content.xpath('//tr[@class="even"]//td[4]//text() | //tr[@class="odd"]//td[4]//text()')
        # The four lists are paired by position.
        # NOTE(review): this assumes every row yields exactly one text node
        # per column; an empty cell would shift the pairing - confirm.
        keys = ("job_title", "job_category", "number", "location")
        for row in zip(job_title, job_category, number, location):
            # Build a dict so the record can be stored in MongoDB as-is.
            self.tencent_data.append(dict(zip(keys, row)))
        print("正在获取数据"+"-"*10)

    def startswitch(self):
        """Run the crawl loop, then flush the buffered records to MongoDB.

        Pages are loaded while ``self.switch`` is true; the switch is turned
        off after the page at offset ``LAST_INDEX`` has been processed.
        """
        while self.switch:
            # Build the page URL from the base URL and the current offset.
            self.loadpage(self.url + str(self.index))
            time.sleep(self.REQUEST_DELAY)
            # Stop once the last page has been reached.
            if self.index < self.LAST_INDEX:
                self.index += self.PAGE_STEP
            else:
                self.switch = False
        # Persist everything that was collected.
        self.mongodb()
        print("程序结束")
if __name__ == '__main__':
    # Script entry point: build the crawler and run the whole crawl.
    Tencent().startswitch()
二:数据可视化
源码
# -*- coding:utf-8 -*-
import pymongo
import matplotlib.pyplot as plt
import pylab
# Use the SimHei font so that matplotlib can render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Connect to MongoDB and select the collection filled by the crawler.
client = pymongo.MongoClient(host="localhost", port=27017)
db = client.py3
collection = db.tencent
# Category labels; the same strings are used as the query values below.
labels = ["技术类", "设计类", "产品/项目类", "市场类", "职能类", "内容编辑类", "客户服务类"]
# Number of stored postings per category, in the same order as ``labels``.
# ``Collection.count`` no longer exists in PyMongo 4, hence ``count_documents``.
values = [collection.count_documents({"job_category": label}) for label in labels]
# Radial offset of each wedge; only the last two wedges are pulled out.
explode = [0, 0, 0, 0, 0, 0.2, 0.5]
# Wedge colours
colors = ["yellow", "blue", "red", "purple", "green", "brown", "black"]
# Title
plt.title("招聘岗位类型比例", fontsize=25, color="red")
# Pie chart with each wedge annotated by its percentage
plt.pie(values, labels=labels, colors=colors, autopct="%1.2f%%", explode=explode)
# Equal axis scaling so the pie is drawn as a circle.
plt.axis("equal")
plt.show()