3. Next, use selenium + chromedriver to drive a real browser and BeautifulSoup to parse the rendered pages, so the dynamically loaded data can be scraped.
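Before the full listing, here is a minimal sketch of that selenium + BeautifulSoup pattern. It is only an illustration, assuming Chrome and a matching chromedriver are available on PATH; the URL and the CSS selector are placeholders, not values from the real project.

# Minimal dynamic-page fetch: render with headless Chrome, then parse with BeautifulSoup.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def fetch_rendered_html(url):
    opts = Options()
    opts.add_argument('--headless')           # no visible browser window
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=opts)   # assumes chromedriver is on PATH
    try:
        driver.set_page_load_timeout(20)
        driver.get(url)
        return driver.page_source             # HTML after JavaScript has run
    finally:
        driver.quit()

if __name__ == '__main__':
    html_text = fetch_rendered_html('https://example.com/news/list')  # placeholder URL
    soup = BeautifulSoup(html_text, 'html.parser')
    for a in soup.select('a'):                # placeholder selector
        print(a.get('href'))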
Finally, here is the complete code:
import pyodbc
import pymssql
import socket
import _mssql
import uuid
from multiprocessing import Lock, Pool
import multiprocessing
import decimal
from DBUtils.PooledDB import PooledDB
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger
import threading
import re
import os
import pytz
import configparser
import html
import urllib
from urllib import request
import requests
import datetime
from bs4 import BeautifulSoup
from k_v import k_v
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent
from random import choice
UserHeader = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14"]
def sql_lianjie():
    cf = configparser.ConfigParser()
    cf.read("./run.ini")
    db_Host = cf.get("db", "db_ip")
    db_post = cf.get("db", "db_post")
    db_user = cf.get("db", "db_user")
    db_password = cf.get("db", "db_password")
    db_database = cf.get("db", "db_database")
    db_charset = cf.get("db", "db_charset")
    # Create the database connection pool
    pool = PooledDB(creator=pymssql, mincached=3, maxshared=9, maxconnections=6, blocking=False,
                    host=db_Host, port=int(db_post), user=db_user, password=db_password,
                    database=db_database, charset=db_charset)
    if not pool:
        raise NameError('Failed to connect to the database')
    else:
        print('Database connection established')
    return pool
def func(listTemp, n):
    # Split listTemp into chunks of n items
    aaa = []
    for i in range(0, len(listTemp), n):
        aaa.append(listTemp[i:i + n])
    return aaa
def running():
    # Re-run the crawler on a fixed interval
    scheduler = BlockingScheduler()
    c1 = IntervalTrigger(minutes=1)
    scheduler.add_job(start_object, c1)
    scheduler.start()
def start_object():
    # When the program starts, write a start marker to the log file
    with open('./all.log', 'a', encoding='utf-8') as LOG:
        shijian = datetime.datetime.now()
        LOG.write(str(shijian)[:-7] + '-' + '--------------------------- Crawler run started ---------------------------' + '\n')
    pool = sql_lianjie()
    conn = pool.connection()
    cur = conn.cursor()
    # Read the configuration file
    cf = configparser.ConfigParser()
    cf.read("./run.ini")
    FJ_duankou = cf.get("db", "db_host")
    FJ_sql = cf.get("db", "db_sql")
    db_adress = cf.get("db", "db_adress")
    # Query the site configuration rows
    cur.execute(FJ_sql)
    SQLsession = cur.fetchall()
    cur.close()
    conn.close()
    # Hand each thread 100 rows
    temp = func(SQLsession, 100)
    # Start the worker threads
    thread_list = []
    for i in temp:
        t = threading.Thread(target=fun1, args=(tuple(i), FJ_duankou, db_adress, pool, UserHeader))
        t.daemon = True
        thread_list.append(t)
    for t in thread_list:
        t.start()
        time.sleep(5)
    for t in thread_list:
        t.join()
    # Once every thread has finished, log a completion marker
    with open('./all.log', 'a', encoding='utf-8') as LOG:
        shijian = datetime.datetime.now()
        LOG.write(str(shijian)[:-7] + '-' + '--------------------------- Run finished, waiting for restart ---------------------------' + '\n\n\n')
    time.sleep(180)
def fun1(SQLsession, FJ_duankou, db_adress, pool, UserHeader):
    for get_SQLsession in SQLsession:
        # 查册网 board
        ChaPlate = get_SQLsession[3]
        # 信息网 region
        GovArea = get_SQLsession[2]
        # 查册网 region
        ChaArea = get_SQLsession[4]
        # Attachment download address
        FJ_adress = FJ_duankou + db_adress
        # Page URL
        url = get_SQLsession[1]
        # print(url)
        # Selector for each announcement link (used to extract IDs)
        ID = get_SQLsession[5]
        # Title selector
        TITLE = get_SQLsession[6]
        # Publish-time selector
        PUBLISHTIME = get_SQLsession[7]
        # Content selector
        CONTENT = get_SQLsession[8]
        # Attachment selector
        FJ_Download = get_SQLsession[9]
        # ID recorded on the last fetch
        LastGetID = get_SQLsession[10]
        # Main config id
        ConfigId = get_SQLsession[0]
        # Prefix spliced onto the extracted ID to form the full URL
        ID_Splicing = get_SQLsession[11]
        # Prefix spliced onto attachment links
        FJ_Splicing = get_SQLsession[12]
        # Pick a random request header (get_SQLsession[13])
        User_agent = choice(UserHeader)
        header = {'User-Agent': User_agent}
        get_url(url, ID, TITLE, PUBLISHTIME, CONTENT, GovArea, ChaArea, FJ_Download, FJ_adress, LastGetID, ChaPlate, ConfigId, db_adress, header, ID_Splicing, FJ_Splicing, pool, UserHeader)
def get_url(url, ID, TITLE, PUBLISHTIME, CONTENT, GovArea, ChaArea, FJ_Download, FJ_adress, LastGetID, ChaPlate, ConfigId, db_adress, header, ID_Splicing, FJ_Splicing, pool, UserHeader):
    try:
        socket.setdefaulttimeout(20)
        # Request the list page with the randomly chosen User-Agent
        response = requests.get(url, headers=header, timeout=20)
        response.encoding = 'utf-8'
        time.sleep(3)
        soup = BeautifulSoup(response.text, 'html.parser')
        try:
            time.sleep(1)
            Url = soup.select(ID)
            time.sleep(1)
            Session = 0
            for i in Url:
                Get_ID = re.findall(r'\d+?.[a-z]+?.\d+\d+?.[a-z]{3,5}|20\d+/\S+.[a-z]{3,5}|content/post\d+.?[a-z]{3,4}|adetail\d+_\d+|.?p=\d+|HTML/zwgk\S+.?[a-z]{3,4}|\d+article\d+|aspx.?id=\d+|.asp.?id=\d+|.jsp.?trid=\d+', str(i.get('href')))
                if Get_ID:
                    New_url = ID_Splicing + Get_ID[0]
                    if LastGetID == Get_ID[0]:
                        print('--------------- Page not updated ---------------')
                        break
                    else:
                        Session += 1
                        if Session == 1:
                            # Record the newest ID so the next run knows where to stop
                            conn = pool.connection()
                            cur = conn.cursor()
                            sql = "UPDATE dbo.TDOA_CcwNewsConfig SET LastGetId='%s' WHERE Url='%s'" % (Get_ID[0], url)
                            cur.execute(sql)
                            print('Recorded ID no. %s successfully' % Session)
                            conn.commit()
                            cur.close()
                            conn.close()
                        # =====================================================
                        print(New_url)
                        open_url(New_url, TITLE, PUBLISHTIME, CONTENT, GovArea, ChaArea, url, FJ_Download, FJ_adress, Get_ID, ChaPlate, ConfigId, db_adress, FJ_Splicing, Url, ID_Splicing, pool, UserHeader)
                else:
                    # Fall back to a second set of URL patterns
                    Get_ID = re.findall(r'\d+\S+.[a-z]{3,5}.?id=\d+|\d+\S+.[a-z]{3,5}|20\S+.[a-z]{3,5}|.php.?[a-z]{2,3}=\d+\S+|Info.?id=\S+|.php.?Sid=\S+|Archive.aspx.?Id=\d+|aspx.?id=\d+|\.action.?id=\d+|id=\d+&chid=\d+', str(i.get('href')))
                    if Get_ID:
                        New_url = ID_Splicing + Get_ID[0]
                        if LastGetID == Get_ID[0]:
                            print('--------------- Page not updated ---------------')
                            break
                        else:
                            Session += 1
                            if Session == 1:
                                conn = pool.connection()
                                cur = conn.cursor()
                                sql = "UPDATE dbo.TDOA_CcwNewsConfig SET LastGetId='%s' WHERE Url='%s'" % (Get_ID[0], url)
                                cur.execute(sql)
                                print('Recorded ID no. %s successfully' % Session)
                                conn.commit()
                                cur.close()
                                conn.close()
                            # =====================================================
                            print(New_url)
                            open_url(New_url, TITLE, PUBLISHTIME, CONTENT, GovArea, ChaArea, url, FJ_Download, FJ_adress, Get_ID, ChaPlate, ConfigId, db_adress, FJ_Splicing, Url, ID_Splicing, pool, UserHeader)
                    else:
                        print('----------------- Failed to extract an ID -----------------')
                        continue
        except Exception as e:
            Logger(e, url)
            print(e)
            pass
    except Exception as e:
        Logger(e, url)
        print(e)
        pass
def open_url(New_url, TITLE, PUBLISHTIME, CONTENT, GovArea, ChaArea, url, FJ_Download, FJ_adress, Get_ID, ChaPlate, ConfigId, db_adress, FJ_Splicing, Url, ID_Splicing, pool, UserHeader):
    try:
        chrome_options = Options()
        header = choice(UserHeader)
        chrome_options.add_argument('user-agent=%s' % header)
        # Do not load images
        chrome_options.add_argument('--blink-settings=imagesEnabled=false')
        # Run headless (no visible browser window)
        chrome_options.add_argument('--headless')
        # Disable GPU acceleration
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe', chrome_options=chrome_options)
        driver.set_page_load_timeout(20)
        driver.get(New_url)
        content = driver.page_source.encode('gb18030')
        time.sleep(3)
        driver.quit()
        soup = BeautifulSoup(content, 'html.parser', from_encoding="gb18030")
        Title = get_Title(soup, TITLE, ChaArea)
        PublishTime = get_PublishTime(soup, PUBLISHTIME, New_url)
        Abstract = get_Abstract(soup, CONTENT)
        Content = get_txt(soup, CONTENT, PublishTime, url, FJ_adress, Title)
        time.sleep(2)
        new_list = get_FJ(soup, url, FJ_Download, FJ_adress, PublishTime, Title, Get_ID, db_adress, FJ_Splicing, header, ID_Splicing)
        time.sleep(2)
        start_sql(Title, PublishTime, GovArea, ChaArea, Content, New_url, Abstract, new_list, ChaPlate, ConfigId, url, pool)
    except Exception as e:
        Logger(e, New_url)
        pass
def get_Title(soup, TITLE, ChaArea):
    try:
        # Prefix the title with the 查册网 region name, e.g. 【...】
        a = '【' + k_v[ChaArea] + '】'
    except:
        a = '【' + '】'
    Title = a + soup.select(TITLE)[0].text.strip()
    if '字号:' in Title:  # strip the "font size" label some pages embed in the title
        Title = Title.replace('字号:', '').strip()
    print("------------Title: " + Title)
    return Title
def get_PublishTime(soup, PUBLISHTIME, New_url):
    try:
        if PUBLISHTIME == 'None':
            return ''
        PublishTime = re.search(r'\d{4}[/:-]\d{1,2}[/:-]\d{1,2}\s\d{1,2}[/:-]\d{1,2}[/:-]\d{1,2}|\d{4}[/:-]\d{1,2}[/:-]\d{1,2}\s\d{1,2}[/:-]\d{1,2}|\d{4}[/:-]\d{1,2}[/:-]\d{1,2}|\d{4}[年]\d{1,2}[月]\d{1,2}[日]', soup.select(PUBLISHTIME)[0].text)[0]
        if PublishTime:
            if '年' in PublishTime:
                # Normalize Chinese dates such as 2020年1月1日 to 2020-1-1
                PublishTime = PublishTime.replace('年', '-').replace('月', '-').replace('日', '')
            print("Publish time: " + PublishTime)
            return PublishTime
        else:
            print('Failed to extract the publish time')
    except Exception as e:
        Logger(e, New_url)
        # Fall back to deriving the date from the last URL segment (skip the first character, then read yyyymmdd)
        PublishTime = New_url.split('/')[-1]
        PublishTime = PublishTime[1:5] + '-' + PublishTime[5:7] + '-' + PublishTime[7:9]
        print("Publish time: " + PublishTime)
        return PublishTime
def get_Abstract(soup, CONTENT):
    return ''
    # Txt = soup.select(CONTENT)
    # if Txt:
    #     c = []
    #     for i in Txt:
    #         c.append(i.text)
    #     Content = '\n'.join(c)
    #     if Content:
    #         Abstract = Content[:100].replace(' ', ' ').replace(' ', ' ')
    #         print("Abstract: " + Abstract)
    #         return Abstract
def get_txt(soup, CONTENT, PublishTime, url, FJ_adress, Title):
    try:
        Txt = soup.select(CONTENT)[0]
        if Txt:
            c = []
            for i in Txt:
                Script_BQ = re.findall('