一、网络爬虫
网络爬虫(又称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫。
二、实验思路:
先从目标网页上抓取页面内容,并用正则表达式解析出租房信息,将解析结果暂存到列表中;然后连接数据库,把数据写入数据库中的表;最后从表中读取数据并进行展示与查询。
三、实验代码:
import re
import pandas as pd
import requests
from numpy import *
import matplotlib.pyplot as plt
import pymysql
import matplotlib as mpl
# 获取网页租房信息
def getData():
    """爬取成都链家网前 4 页租房列表。

    返回:
        data_house: pd.DataFrame,由 data_list 直接构造。
        data_list:  [[(标题, 面积, 月租金), ...], ...],每个内层列表对应一页
                    的解析结果(原实现把 data_list 放在循环体内,每页都会清空,
                    只有最后一页的数据被保留;这里在循环外初始化并逐页累加)。
    """
    baseurl = 'https://cd.lianjia.com/zufang/pg'
    # 请求头与正则都是循环不变量,提到循环外只构造/编译一次
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    # DOTALL 让 .*? 可以跨行匹配;三个分组依次是标题、面积、月租金
    regex = re.compile('title="(.*?)">\n.*?([0-9.]+)㎡\n.*?([0-9.]+) 元/月', re.DOTALL)
    data_list = []
    for i in range(1, 5):
        print('正在爬取第%d页' % (i))
        url = baseurl + str(i) + '/#contentList'
        print(url)
        response = requests.get(url, headers=header)
        if response.status_code == 200:
            data = list(regex.findall(response.text))
            data_list.append(data)
    data_house = pd.DataFrame(data_list)
    return data_house, data_list
# 提取信息
def extractData(data):
    """从单页解析结果中提取字段。

    原实现只用参数 data 取长度,取数时却读全局变量 data_list,导致函数
    脱离脚本无法使用;这里改为完全使用参数(调用方传入的就是 data_list[0],
    行为不变)。

    参数:
        data: [(标题, 面积, 月租金), ...],标题形如 "整租·小区名 户型 朝向"。
    返回:
        res:      [[小区名, 月租金], ...]
        res_list: [[出租方式, 小区名, 户型, 朝向, 面积, 月租金], ...]
    """
    res = []
    res_list = []
    for title, area, rent in data:
        parts = title.split()
        # parts[0] 形如 "整租·小区名":前两个字符是出租方式,
        # 下标 2 是分隔符 "·",下标 3 起是小区名
        way = parts[0][:2]
        name = parts[0][3:]
        res.append([name, rent])
        res_list.append([way, name, parts[1], parts[2], area, rent])
    return res, res_list
# 连接数据库
def connectData(data):
    """把提取出的租房记录逐条写入 MySQL 的 rent 表。

    参数:
        data: [[出租方式, 小区名, 户型, 朝向, 面积, 月租金], ...]

    修复点:
      * 原实现用 f-string 拼接 SQL,字段里出现引号或正则元字符会报错,
        也存在 SQL 注入风险 —— 改为参数化查询,由驱动负责转义;
      * 原来的 `except UnicodeDecodeError` 位于宽泛的 `except Exception`
        之后,永远不可达 —— 删除;
      * 异常原来被静默吞掉(只累加计数),现在打印出错原因;
      * 逐行 commit 改为循环结束后一次性 commit;连接在 finally 中关闭。
    """
    conn = pymysql.connect(
        user="root",
        port=3306,
        passwd="011201",
        db="test",
        host="127.0.0.1",
        charset='utf8'
    )
    if not conn:
        print('数据库连接失败')
        return
    print('数据库连接成功')
    error = 0
    # 占位符由 pymysql 填充,列名与建表时保持一致
    insert_re = ('insert into rent(number, ways,town_name, layout,towards,area,rent) '
                 'values (%s, %s, %s, %s, %s, %s, %s)')
    try:
        cursor = conn.cursor()
        for num, item in enumerate(data, start=1):
            print(item)
            cursor.execute(insert_re, (num, str(item[0]), str(item[1]), str(item[2]),
                                       str(item[3]), str(item[4]), str(item[5])))
        conn.commit()
    except Exception as e:
        error = error + 1
        print('写入数据库出错:', e)
    finally:
        conn.close()
# 显示数据
def showData(data):
    """按小区汇总平均租金并绘制柱状图,返回 (小区名列表, 平均租金列表)。

    参数:
        data: [[小区名, 月租金字符串], ...](即 extractData 返回的 res)。

    修复点:
      * 原实现忽略参数而读全局 data_list,且内层循环从 i 开始配对,
        同一小区会在 x 中出现多次、靠后的条目只对后缀求均值,柱状图
        和平均值都是错的 —— 改为按首次出现顺序对小区分组、一次求均值。
    """
    print(len(data))
    grouped = {}
    # dict 保持插入顺序,等价于"按首次出现顺序"收集每个小区的所有租金
    for name, price in data:
        grouped.setdefault(name, []).append(int(price))
    x = list(grouped)
    y = [mean(v) for v in grouped.values()]
    print(x)
    print(len(x))
    print(y)
    print(len(y))
    # 中文字体与负号显示
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    fig, ax = plt.subplots(figsize=(8, 4), dpi=100)
    ax.set_xlabel('小区名')
    ax.set_ylabel('平均租金')
    ax.set_title('该小区与平均租金之间的关系')
    plt.bar(x, y, 0.5, color='coral', edgecolor='grey')
    plt.xticks(x, x, rotation=90)
    # 在每根柱子顶端标注数值
    for a, b in zip(x, y):
        plt.text(a, b, b, ha='center', va='bottom')
    # 去掉四周边框
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    plt.show()
    return x, y
def query(x, y):
    """按小区名模糊查询平均租金。

    在输入的每个字符之间插入 ".*",实现"子序列"式模糊匹配
    (输入 "锦江" 可命中 "锦绣江南")。原实现直接把用户输入当正则用,
    输入 "(" 等元字符会使 re.compile 抛异常 —— 对每个字符做 re.escape。

    参数:
        x: 小区名列表;y: 与 x 对应的平均租金列表。
    """
    name = '.*'.join(re.escape(ch) for ch in input('请输入小区名称:'))
    regex = re.compile(name)
    for i in range(0, len(x)):
        if regex.search(x[i]):
            print(f'该小区{x[i]}的平均租金为:{y[i]}')
# 主函数:爬取 -> 提取 -> 入库 -> 可视化 -> 交互查询
# (原文粘贴时丢失了缩进,if 之后的语句必须缩进否则是语法错误)
if __name__ == '__main__':
    data_house, data_list = getData()
    print(data_list)
    # 只处理第一页的解析结果,与原逻辑一致
    res, res_list = extractData(data_list[0])
    connectData(res_list)
    print(res)
    x, y = showData(res)
    # 循环接受用户查询,Ctrl+C 退出
    while True:
        query(x, y)
四、实验结果:
五、实验总结:
经过本次实验,我充分认识并了解网络爬虫的过程,学习到了使用python进行网络爬虫,体会到python语言在网络爬虫方面的优越性。除此之外,还体会到了正则表达式的简便。
在实验中,遇到了数据传入数据库创建表格后,表格接收不到数据的问题。查明后发现问题出在创建的表格与代码中信息名称和顺序没有对应上,改正后运行代码成功。