mysql py@localhost:(none)> USE spider;
mysql py@localhost:spider> CREATE TABLE IF NOT EXISTS university (
-> id INT NOT NULL AUTO_INCREMENT,
-> name VARCHAR(30) NOT NULL,
-> address VARCHAR(20) NOT NULL,
-> score FLOAT NOT NULL,
-> PRIMARY KEY (id)
-> ) DEFAULT CHARSET = utf8mb4;  -- explicit charset so non-ASCII text does not depend on the server default
-- 在spider数据库中新建一个university表,并设置id,name,address,score字段
注意:需将MySQL(数据库、数据表以及客户端连接)的字符集设置为UTF-8(建议使用utf8mb4),否则中文校名写入后会出现乱码。
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
# @Version: v1.0
# @License: Apache Licence 2.0
# @File Name: CrawUnivRankingC.py
# @Description: 爬取2018年中国大学排名并写入MySQL数据库
# @Author: pengshp
# @Date: 2018/10/17 0017
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
import requests
from bs4 import BeautifulSoup
import bs4
import pymysql
def getHTMLText(url):
    """Download the page at ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection error, timeout, or a non-success HTTP status).
    """
    kv = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=kv, timeout=20)
        r.raise_for_status()
        # Decode with the encoding guessed from the body rather than the one
        # declared in the HTTP headers, which may be missing or wrong.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # RequestException is the common base of HTTPError, ConnectionError
        # and Timeout, so network failures are reported here as well instead
        # of escaping as an unhandled traceback.
        print("爬取异常")
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking table in ``html`` and append its rows to ``ulist``.

    Each appended row is a list holding the text of the first four ``<td>``
    cells of one table row, in document order.

    Args:
        ulist: List that receives the parsed rows; it is modified in place.
        html: HTML text of the ranking page. ``None`` or an empty string is
            accepted and leaves ``ulist`` untouched.

    Returns:
        None.
    """
    if not html:
        # Nothing was downloaded; there is nothing to parse.
        return

    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # The page has no table body (e.g. an error page or changed layout).
        return

    for tr in tbody.children:
        # Children also include whitespace text nodes between rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')
        if len(tds) < 4:
            # Skip malformed or header-like rows instead of raising IndexError.
            continue
        # get_text() still yields the text when a cell wraps it in nested
        # markup, where ``.string`` would be None.
        ulist.append([td.get_text(strip=True) for td in tds[:4]])
def printUnivList(ulist, num):
    """Print a header and up to ``num`` rows of ``ulist`` for a quick check.

    Args:
        ulist: Sequence of rows; each row is indexable, with the rank at
            index 0, the school name at index 1 and the total score at
            index 3.
        num: Maximum number of rows to print. Values larger than
            ``len(ulist)`` print every available row; values <= 0 print
            only the header.

    Returns:
        None.
    """
    row_fmt = "{:^10}\t{:^6}\t{:^10}"
    print(row_fmt.format("排名", "学校名称", "总分"))
    for u in ulist[:max(num, 0)]:
        # Index 3 is the score; index 2 (address) has no column in this
        # three-column layout. str() keeps None cells from breaking the
        # alignment format spec.
        print(row_fmt.format(str(u[0]), str(u[1]), str(u[3])))
def writeDB(ulist, num):
    """Insert up to ``num`` rows of ``ulist`` into the ``university`` table.

    All rows are written in a single transaction: either every row is
    committed or, on any error, the transaction is rolled back.

    Args:
        ulist: Sequence of rows; index 1 is stored as ``name``, index 2 as
            ``address`` and index 3 (converted with ``float``) as ``score``.
        num: Maximum number of rows to write. Values larger than
            ``len(ulist)`` write every available row.

    Returns:
        None.

    Raises:
        ValueError, TypeError: If a score cannot be converted to ``float``.
        Exception: Any error raised while inserting or committing is
            re-raised after the rollback, so the caller does not report
            success for data that was never stored.
    """
    # Convert before connecting so bad input fails without touching the DB.
    rows = [(str(u[1]), str(u[2]), float(u[3])) for u in ulist[:max(num, 0)]]

    # Keyword arguments: newer PyMySQL releases no longer accept positional
    # connection parameters. utf8mb4 keeps non-ASCII names intact on the wire.
    db = pymysql.connect(host="192.168.10.30", user="py",
                         password="xxxxxxxx", database="spider",
                         charset="utf8mb4")
    try:
        with db.cursor() as cur:
            # Values are bound as parameters, so the driver handles quoting
            # (no SQL injection, no breakage on quotes in a name) and the
            # score keeps its fractional part.
            cur.executemany(
                "INSERT INTO `university` (name,address,score) "
                "VALUES (%s,%s,%s)",
                rows)
        db.commit()
    except Exception:
        db.rollback()  # undo the partial batch, then surface the failure
        raise
    finally:
        db.close()
def main():
    """Crawl the 2018 ranking page and store up to 500 rows in MySQL.

    Stops early, without touching the database, when the page could not be
    downloaded or no ranking rows could be parsed from it.
    """
    uinfo = []
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    html = getHTMLText(url)
    if not html:
        print("未获取到网页内容,数据未写入。")
        return

    fillUnivList(uinfo, html)
    if not uinfo:
        print("未解析到排名数据,数据未写入。")
        return

    # Write the top 500 rows, but never request more rows than were parsed.
    writeDB(uinfo, min(500, len(uinfo)))
    print("数据写入成功!")
    # Debug aid: printUnivList(uinfo, num=20)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
mysql py@localhost:spider> SELECT id, name, address, score FROM university WHERE id <= 10 ORDER BY id;
前十有我的学校,O(∩_∩)O哈哈~
数据来源为上海交通大学发布的中国最好大学排行榜。
1、最好大学网:http://www.zuihaodaxue.cn
2、pymysql文档:https://pymysql.readthedocs.io/en/latest/index.html