Scraping the Baidu hot search list with a Python crawler: a summary

I'm new to Python. With the help of online resources I wrote a crawler that scrapes the Baidu hot search list every 2 hours (topic type, title, image URL, topic URL, rank, hot index, topic snippet, and so on) and stores the data in a database for later statistics and analysis.
The directory layout follows Java conventions (please bear with me); the implementation is as follows:

Project structure
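The original layout is shown only as a screenshot; the tree below is reconstructed from the imports in the code, and the controller file name is an assumption:

reptile/
├── Config.py                  # database and scraping settings
├── controller/
│   └── BaiduHotController.py  # file name assumed; wires the config to the service
├── service/
│   └── WebHandleService.py    # fetches and parses the hot search page
└── dao/
    ├── BaiduHotMainDao.py     # writes baidu_hot_main
    └── BaiduHotContentDao.py  # writes baidu_hot_content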

1. Database design:

CREATE TABLE `baidu_hot_main` (
  `id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'primary key',
  `address` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'scraped URL',
  `position` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'CSS selector',
  `times` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'run count',
  `header` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'request headers',
  `describe` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'description',
  `date` datetime DEFAULT NULL COMMENT 'date',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;


CREATE TABLE `baidu_hot_content` (
  `id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'primary key',
  `main_id` varchar(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'link to baidu_hot_main.id',
  `top` int DEFAULT NULL COMMENT 'rank',
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'title',
  `href` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'topic URL',
  `content` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'content snippet',
  `type` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'topic type',
  `img` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT 'image URL',
  `index` varchar(25) DEFAULT NULL COMMENT 'hot index',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

2. Configuration file:

The config file holds the database connection settings and the scraping settings.

# Database settings
host = "127.0.0.1"      # host
port = 3306             # port
user = "root"           # database user
password = "root"       # database password
database = "pachong"    # name of the database to connect to

# Scraping target settings
url1 = ''
headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
cookies1 = ''
params1 = ""
contentA1 = "#sanRoot > main > div.container.right-container_2EFJr > div > div:nth-child(2) > div "

url1: URL of the page to scrape.
headers1: a browser User-Agent so the request looks like a normal browser visit, which helps avoid basic anti-crawling checks.
contentA1: CSS selector of the HTML block to scrape; it has to be looked up in the browser developer console for the target page.



How to get contentA1: open the page, open the developer console (right-click > Inspect), locate the div you want to scrape in the Elements panel, then right-click it and choose Copy > Copy selector.
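Before wiring the selector into the crawler, it is worth checking that it actually matches the topic blocks. A minimal sketch, assuming url1 in Config.py has been filled in with the target page:

import requests
from bs4 import BeautifulSoup

from reptile.Config import url1, headers1, contentA1

# Fetch the page exactly as the crawler will, then count how many nodes the selector matches.
response = requests.get(url1, headers=headers1)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'html.parser')
nodes = soup.select(contentA1)
print("selector matched", len(nodes), "blocks")  # expect one block per hot topic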

3. Controller

from reptile.Config import url1, contentA1, headers1, cookies1, params1
from reptile.service.WebHandleService import runRepitleMain


# Wrapped in a class so the settings are easy to reuse and extend via inheritance
class urlConfig():
    # page settings
    url = url1
    contentA = contentA1
    headers= headers1
    cookies = cookies1
    params= params1

def runRepitle():
    runRepitleMain(urlConfig)
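The post does not show where runRepitle() is called from; a minimal entry script might look like this (the controller module path is an assumption, adjust it to your actual layout):

# main.py (hypothetical entry point)
from reptile.controller.BaiduHotController import runRepitle  # module path is an assumption

if __name__ == '__main__':
    runRepitle()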

4. Service

import time
import requests
import uuid
from bs4 import BeautifulSoup
from reptile.dao.BaiduHotContentDao import setBatchContent, setContent
from reptile.dao.BaiduHotMainDao import setMain, getTimes


# Fetch the page and scrape the Baidu hot search list
def runRepitleMain(urlConfig):
    response = requests.get(urlConfig.url, headers=urlConfig.headers)
    response.encoding = 'utf-8'
    # Build the parser
    soup = BeautifulSoup(response.text, 'html.parser')
    id1 = uuid.uuid1()
    date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    setMainTab(urlConfig.url, urlConfig.contentA, urlConfig.headers, id1, date)
    a = soup.select(urlConfig.contentA)
    # rows to insert
    data = []
    for i in range(0, len(a)):
        # (id, main_id, top, title, href, content, type, img, index)
        # primary key
        id2 = uuid.uuid1()
        # rank on the hot list
        top0 = a[i].find("a").find("div").text.strip()
        # title
        title = a[i].find(name="div", attrs={"class": "content_1YWBm"}).find("a").text.strip()
        # topic tag (type)
        type = a[i].find(name="div", attrs={"class": "hot-tag_1G080"}).text.strip()
        # topic link
        href = a[i].find("a").get("href")
        # image link
        img = a[i].find("a").find("img").get("src")
        # hot index
        index = a[i].find(name="div", attrs={"class": "hot-index_1Bl1a"}).text.strip()
        # content snippet
        content = a[i].find(name="div", attrs={"class": "large_nSuFU"}).text.replace("查看更多>", '').strip()
        # default the rank to "0" when the rank cell is empty
        top = ("0", top0)[len(top0) != 0]
        print("Scraping item " + top + ": title=" + title + ", type=" + type)
        # collect the row for the batch insert
        data.append((id2, id1, top, title, href, content, type, img, index))
    # batch insert
    setBatchContent(data)
    return "scrape finished"

# Record this scraping run in the main table
def setMainTab(url, contentA, headers, id1, date):
    times = getTimes(url, "百度热搜爬取内容") + 1
    print("Total runs: " + str(times))
    setMain(id1, str(url), str(contentA), str(times), str(headers), "百度热搜爬取内容", date)
# The scraped items themselves are written to the content table by setBatchContent above

5. DAO

BaiduHotContentDao.py:

import pymysql
from reptile import Config

sql_insert_baidu_hot_content = '''insert into baidu_hot_content (id,main_id,top,title,href,content,type,img) values(%s,%s,%s,%s,%s,%s,%s,%s)'''
sql_insert_baidu_hot_content_batch = '''insert into baidu_hot_content (id,main_id,top,title,href,content,type,img,`index`)  values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'''



# Insert a single row into the content table
def setContent(id, main_id, top, title, href, content, type, img):
    # Connect to the database
    db = pymysql.connect(
        host=Config.host,          # host
        port=Config.port,          # port
        user=Config.user,          # database user
        password=Config.password,  # database password
        database=Config.database   # database name
    )
    # Get a cursor
    cursor = db.cursor()
    try:
        # Run the insert
        cursor.execute(sql_insert_baidu_hot_content, (id, main_id, top, title, href, content, type, img))
        # Commit the transaction
        db.commit()
        print('insert succeeded')
    except Exception as e:
        print(e)
        # Roll back on error
        db.rollback()
        print('insert failed')
    finally:
        # Close the connection
        db.close()

    return 1


# Batch insert into the content table
def setBatchContent(dataList):
    # Connect to the database
    db = pymysql.connect(
        host=Config.host,          # host
        port=Config.port,          # port
        user=Config.user,          # database user
        password=Config.password,  # database password
        database=Config.database   # database name
    )
    # Get a cursor
    cursor = db.cursor()
    try:
        # Run the batch insert
        cursor.executemany(sql_insert_baidu_hot_content_batch, dataList)
        # Commit the transaction
        db.commit()
        print('batch insert succeeded')
    except Exception as e:
        print(e)
        # Roll back on error
        db.rollback()
        print('batch insert failed')
    finally:
        # Close the connection
        db.close()

    return 1

BaiduHotMainDao.py:

import pymysql
from reptile import Config


# SQL statements
sql_insert_baidu_hot_main = '''insert into baidu_hot_main(id,address,`position`,times,header,`date`,`describe`) values(%s,%s,%s,%s,%s,%s,%s)'''
sql_getTimes_baidu_hot_main = "select id from baidu_hot_main where address=%s and `describe`=%s"


# Query how many times this source has already been scraped
def getTimes(address, describe):
    count = 0
    try:
        # Connect to the database
        db = pymysql.connect(
            host=Config.host,          # host
            port=Config.port,          # port
            user=Config.user,          # database user
            password=Config.password,  # database password
            database=Config.database   # database name
        )
        cursor = db.cursor()
        # execute() returns the number of matching rows, i.e. the number of recorded runs
        count = cursor.execute(sql_getTimes_baidu_hot_main, (address, describe))
        print('describe=' + describe + ';;;;address=' + address)
        db.commit()
        print('run-count query succeeded')
    except Exception as e:
        print(e)
        # Roll back on error
        db.rollback()
        print('run-count query failed')
    finally:
        # Close the connection
        db.close()

    return count



# Insert a run record into the main table
def setMain(id, address, position, times, header, describe, date):
    try:
        # Connect to the database
        db = pymysql.connect(
            host=Config.host,          # host
            port=Config.port,          # port
            user=Config.user,          # database user
            password=Config.password,  # database password
            database=Config.database   # database name
        )
        cursor = db.cursor()
        # Run the insert
        cursor.execute(sql_insert_baidu_hot_main, (id, address, position, times, header, date, describe))
        # Commit the transaction
        db.commit()
        print('insert succeeded')
    except Exception as e:
        print(e)
        # Roll back on error
        db.rollback()
        print('insert failed')
    finally:
        # Close the connection
        db.close()

    return 1
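
The pymysql.connect(...) block is copied into every DAO function; a small shared helper (hypothetical, not part of the original code) could remove that duplication:

import pymysql
from reptile import Config

def getConnection():
    # Build the connection in one place so each DAO function only has to deal with its SQL.
    return pymysql.connect(
        host=Config.host,
        port=Config.port,
        user=Config.user,
        password=Config.password,
        database=Config.database
    )

Each DAO function would then start with db = getConnection() instead of repeating the connection parameters.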

A later update will cover how to run this as a scheduled task and how to package and deploy it...
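In the meantime, a minimal sketch of the 2-hour interval mentioned in the introduction is a plain sleep loop (the controller import path is the same assumption as in the entry-point example above):

import time

from reptile.controller.BaiduHotController import runRepitle  # module path is an assumption

if __name__ == '__main__':
    while True:
        runRepitle()              # one scraping run
        time.sleep(2 * 60 * 60)   # wait 2 hours before the next run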
