I'm new to Python. Based on resources found online, I wrote a crawler that scrapes Baidu's hot search board every two hours (topic type, title, image URL, topic URL, rank, heat index, topic excerpt, and so on) and stores the results in a database for later statistics and analysis.
The directory layout follows Java conventions (controller / service / dao — a habit from Java, don't judge). The implementation is as follows:
1. Database schema:
CREATE TABLE `baidu_hot_main` (
  `id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'primary key',
  `address` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'scraped URL',
  `position` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'CSS selector used',
  `times` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'run count',
  `header` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'request headers',
  `describe` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'description',
  `date` datetime DEFAULT NULL COMMENT 'scrape time',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
CREATE TABLE `baidu_hot_content` (
  `id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'primary key',
  `main_id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'foreign key to baidu_hot_main.id',
  `top` int DEFAULT NULL COMMENT 'rank',
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'title',
  `href` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'topic URL',
  `content` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'topic excerpt',
  `type` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'tag type',
  `img` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'image URL',
  `index` varchar(25) DEFAULT NULL COMMENT 'heat index',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
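Each scrape run writes one row to baidu_hot_main, and its topics link back through baidu_hot_content.main_id, so later analysis can join the two tables. For example, a query along these lines (not part of the original code, just an illustration against the schema above) lists every topic captured in each run:

SELECT m.`date`, c.top, c.title, c.type, c.`index`
FROM baidu_hot_main m
JOIN baidu_hot_content c ON c.main_id = m.id
ORDER BY m.`date` DESC, c.top;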
2. Configuration file:
The Config module holds the database connection settings and the page scraping settings.
# Database settings
host = "127.0.0.1"    # database host
port = 3306           # port
user = "root"         # database user
password = "root"     # database password
database = "pachong"  # database to connect to
# Page scraping settings
url1 = ''
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
cookies1 = ''
params1 = ""
contentA1 = "#sanRoot > main > div.container.right-container_2EFJr > div > div:nth-child(2) > div "
url1: the URL of the page to scrape.
headers1: a real browser's User-Agent, so requests look like ordinary browser traffic and pass basic anti-scraping checks.
contentA1: the CSS selector of the elements to scrape. To obtain it, open the page in a browser, right-click the element you want and choose Inspect, then in the Elements panel right-click the target div and pick Copy → Copy selector.
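Before wiring a freshly copied selector into the crawler, it is worth checking that it actually matches something. A minimal sketch (assuming url1 has been filled in with the real address):

import requests
from bs4 import BeautifulSoup
from reptile.Config import url1, headers1, contentA1

resp = requests.get(url1, headers=headers1)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'html.parser')
# A working selector matches one element per hot-search entry
print(len(soup.select(contentA1)))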
3. Controller

from reptile.Config import url1, contentA1, headers1, cookies1, params1
from reptile.service.WebHandleService import runRepitleMain

# Wrap the settings in a class so they are easy to pass around and subclass
class urlConfig():
    # Page settings
    url = url1
    contentA = contentA1
    headers = headers1
    cookies = cookies1
    params = params1

def runRepitle():
    runRepitleMain(urlConfig)
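The two-hour schedule mentioned at the top is not shown in the code; the simplest way to get it is a long-running loop around runRepitle (a cron job or the schedule package would work equally well). A sketch, where the controller module path is my guess:

import time
from reptile.controller.UrlController import runRepitle  # hypothetical module path

if __name__ == '__main__':
    while True:
        runRepitle()
        time.sleep(2 * 60 * 60)  # wait two hours between runs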
4. Service

import time
import uuid

import requests
from bs4 import BeautifulSoup

from reptile.dao.BaiduHotContentDao import setBatchContent, setContent
from reptile.dao.BaiduHotMainDao import setMain, getTimes

# Fetch and parse the Baidu hot search page
def runRepitleMain(urlConfig):
    response = requests.get(urlConfig.url, headers=urlConfig.headers)
    response.encoding = 'utf-8'
    # Build the parser
    soup = BeautifulSoup(response.text, 'html.parser')
    # str() because pymysql cannot bind uuid.UUID objects directly
    id1 = str(uuid.uuid1())
    date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    setMainTab(urlConfig.url, urlConfig.contentA, urlConfig.headers, id1, date)
    a = soup.select(urlConfig.contentA)
    # Rows to insert: (id, main_id, top, title, href, content, type, img, index)
    data = []
    for i in range(len(a)):
        # Primary key
        id2 = str(uuid.uuid1())
        # Rank
        top0 = a[i].find("a").find("div").text.strip()
        # Title
        title = a[i].find(name="div", attrs={"class": "content_1YWBm"}).find("a").text.strip()
        # Tag type (e.g. "新", "热")
        type = a[i].find(name="div", attrs={"class": "hot-tag_1G080"}).text.strip()
        # Topic link
        href = a[i].find("a").get("href")
        # Image URL
        img = a[i].find("a").find("img").get("src")
        # Heat index
        index = a[i].find(name="div", attrs={"class": "hot-index_1Bl1a"}).text.strip()
        # Topic excerpt, with the trailing "查看更多>" link text stripped
        content = a[i].find(name="div", attrs={"class": "large_nSuFU"}).text.replace("查看更多>", '').strip()
        # The pinned entry at the top carries no rank text; default it to "0"
        top = top0 if top0 else "0"
        print("Scraped item " + top + ": title=" + title + ", tag=" + type)
        data.append((id2, id1, top, title, href, content, type, img, index))
    # Batch insert
    setBatchContent(data)
    return "scrape finished"

# Record this run in the main table
def setMainTab(url, contentA, headers, id1, date):
    times = getTimes(url, "百度热搜爬取内容") + 1
    print("Total runs: " + str(times))
    setMain(id1, str(url), str(contentA), str(times), str(headers), "百度热搜爬取内容", date)
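One weakness of the parsing above: every find() call assumes the element exists, so if Baidu renames a single CSS class, the .text access raises AttributeError and the whole run dies. A small guard helper (my addition, not in the original) keeps one missing field from aborting the run:

# Hypothetical helper: return a default instead of raising when a node is missing
def safeText(node, name, attrs=None, default=''):
    found = node.find(name, attrs=attrs or {})
    return found.text.strip() if found is not None else default

# e.g. instead of a[i].find(name="div", attrs={"class": "hot-index_1Bl1a"}).text.strip():
# index = safeText(a[i], "div", {"class": "hot-index_1Bl1a"})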
5. Dao

BaiduHotContentDao:

import pymysql
from reptile import Config

sql_insert_baidu_hot_content = '''insert into baidu_hot_content (id,main_id,top,title,href,content,type,img) values(%s,%s,%s,%s,%s,%s,%s,%s)'''
sql_insert_baidu_hot_content_batch = '''insert into baidu_hot_content (id,main_id,top,title,href,content,type,img,`index`) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

# Insert a single row
def setContent(id, main_id, top, title, href, content, type, img):
    # Connect to the database
    db = pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database
    )
    cursor = db.cursor()
    try:
        # Run the insert
        cursor.execute(sql_insert_baidu_hot_content, (id, main_id, top, title, href, content, type, img))
        # Commit the transaction
        db.commit()
        print('insert succeeded')
    except Exception as e:
        print(e)
        # Roll back on failure
        db.rollback()
        print('insert failed')
    finally:
        # Always close the connection
        db.close()
    return 1

# Batch insert
def setBatchContent(dataList):
    db = pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database
    )
    cursor = db.cursor()
    try:
        # executemany runs the insert once per tuple in dataList
        cursor.executemany(sql_insert_baidu_hot_content_batch, dataList)
        db.commit()
        print('batch insert succeeded')
    except Exception as e:
        print(e)
        db.rollback()
        print('batch insert failed')
    finally:
        db.close()
    return 1
BaiduHotMainDao:

import pymysql
from reptile import Config

sql_insert_baidu_hot_main = '''insert into baidu_hot_main(id,address,`position`,times,header,`date`,`describe`) values(%s,%s,%s,%s,%s,%s,%s)'''
sql_getTimes_baidu_hot_main = "select id from baidu_hot_main where address=%s and `describe`=%s"

# Count how many runs are already recorded for this address
def getTimes(address, describe):
    times = 0
    # Connect before the try block so the finally clause never sees an undefined db
    db = pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database
    )
    cursor = db.cursor()
    try:
        # For a SELECT, cursor.execute returns the number of matching rows
        times = cursor.execute(sql_getTimes_baidu_hot_main, (address, describe))
        print('describe=' + describe + '; address=' + address)
        print('run-count query succeeded')
    except Exception as e:
        print(e)
        print('run-count query failed')
    finally:
        # Always close the connection
        db.close()
    return times

# Insert a row into the main table
def setMain(id, address, position, times, header, describe, date):
    db = pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database
    )
    cursor = db.cursor()
    try:
        cursor.execute(sql_insert_baidu_hot_main, (id, address, position, times, header, date, describe))
        db.commit()
        print('insert succeeded')
    except Exception as e:
        print(e)
        # Roll back on failure
        db.rollback()
        print('insert failed')
    finally:
        db.close()
    return 1
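Every dao function repeats the same pymysql.connect block; pulling it into one shared helper keeps the connection settings in a single place. A sketch (the getConnection name is mine, not from the original):

import pymysql
from reptile import Config

def getConnection():
    # Build a connection from the values in the Config module
    return pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database,
        charset='utf8mb4'
    )

Each dao function could then start with db = getConnection(), or wrap it in contextlib.closing() so the connection is released even on an unexpected error.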