Main crawler script
# -*- coding: utf-8 -*-
import sys
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import random
import re
import os
import threading

# Global download counter and a lock (the lock is declared but not used below)
count1 = 0
lock = threading.Lock()
# Send the catalog request with urllib and return the yearbook catalog HTML
def get_result(ybcode, page=1):
    try:
        # Form fields for the catalog query
        data = {'ybcode': ybcode, 'entrycode': '', 'page': page, 'pagerow': '20',
                'Referer': 'http://data.cnki.net/Yearbook'}
        # Request headers
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
            # The Cookie may need to be refreshed from time to time
            'Cookie': 'Ecp_ClientId=2201106155502682665; Ecp_LoginStuts={"IsAutoLogin":false,"UserName":"sh0292",'
                      '"ShowName":"%e4%b8%ad%e5%9b%bd%e7%9f%bf%e4%b8%9a%e5%a4%a7%e5%ad%a6%e5%9b%be%e4%b9%a6%e9%a6%86",'
                      '"UserType":"bk","BUserName":"","BShowName":"","BUserType":"","r":"R7eKrF"}; '
                      'c_m_LinID=LinID=WEEvREcwSlJHSldSdmVqeVpQWEhjK2JqNWVTcFpPTFJSTFVLMnUxWGMyQT0'
                      '=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!&ot=11/06/2020 17:39:14; '
                      'LID=WEEvREcwSlJHSldSdmVqeVpQWEhjK2JqNWVTcFpPTFJSTFVLMnUxWGMyQT0'
                      '=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!; c_m_expire=2020-11-06 17:39:14; '
                      'Hm_lvt_911066eb2f53848f7d902db7bb8ac4d7=1604650989,1604651116,1604651168,1604654428; '
                      'ASP.NET_SessionId=3d0xpwff2pt0exxcclmw3we4; SID=009023; '
                      'Hm_lpvt_911066eb2f53848f7d902db7bb8ac4d7=1604654428',
            'Referer': 'https://login.cnki.net/login/?platform=kns&ForceReLogin=1&ReturnURL=https://www.cnki.net/',
        }
        # Catalog query endpoint
        url = "https://data.cnki.net/Yearbook/PartialGetCatalogResult"
        # URL-encode the form fields
        params = urllib.parse.urlencode(data).encode(encoding='utf-8')
        # Build the request
        req = urllib.request.Request(url, params, headers)
        # Send it and receive the response
        r = urllib.request.urlopen(req)
        # Read the response body
        res = str(r.read(), 'utf-8')
        # print(res)
        return res
    # Exit on any exception, so the watchdog can restart the crawler
    except Exception as e:
        print(e)
        sys.exit(0)
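# For reference, urllib.parse.urlencode serializes the form-field dict into an
# x-www-form-urlencoded body (illustrative values, not captured from the site):
#   urllib.parse.urlencode({'ybcode': 'N2014030143', 'page': 1})
#   ->  'ybcode=N2014030143&page=1'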
# Get the total number of catalog pages
def get_pageno(ybcode):
    soup = BeautifulSoup(get_result(ybcode), 'lxml')
    # The page count sits between the second '共' and the following '页' in the pager text
    pages = int(soup.select('.s_p_listl')[0].get_text().split("共")[2].split('页')[0])
    print('Total pages: ' + str(pages))
    return pages
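# The parsing above assumes the '.s_p_listl' text contains two '共' markers,
# e.g. for a hypothetical pager string '共95条共5页':
#   '共95条共5页'.split('共')[2].split('页')[0]  ->  '5'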
# Clean text: remove all \n, \r and spaces, and turn runs of '>' into '-'
def dataclear(data):
    data = re.sub('\n+', '', data)
    data = re.sub('\r+', '', data)
    data = re.sub(' +', '', data)
    data = re.sub('>+', '-', data)
    return data
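# Illustration with a hypothetical raw title (not taken from the site):
#   dataclear('能源\n平衡表  >> 表1')  ->  '能源平衡表-表1'
# newlines, carriage returns and spaces are removed; runs of '>' become '-'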
# Download all Excel tables of one yearbook (e.g. a CNKI statistical yearbook)
def filedata(yearBook, yearBookName):
    ybcode = yearBook.get('ybcode')
    dictionaryName = os.getcwd() + '/' + yearBook.get('year') + yearBookName
    dictionaryName = dataclear(dictionaryName)
    pageno = get_pageno(ybcode)
    print(os.getcwd())
    # Create the per-yearbook folder if it does not exist yet
    if not os.path.isdir(dictionaryName):
        os.mkdir(dictionaryName)
    os.chdir(dictionaryName)
    for i in range(1, pageno + 1, 1):
        print('###### Page ' + str(i) + ' ######')
        # Parse the catalog page with BeautifulSoup
        soup = BeautifulSoup(get_result(ybcode, i), 'lxml')
        # print(soup)
        for j in soup.select('tr'):
            s = BeautifulSoup(str(j), 'lxml')
            # Skip rows without a download icon
            if len(s.select('img[src="/resources/design/images/nS_down2.png"]')) == 0:
                continue
            try:
                # The third td holds the download links
                if len(s.select('td:nth-of-type(3) > a')) >= 1:
                    # Table title
                    title = str(s.select('td:nth-of-type(1) > a')[0].get_text())
                    # Download link
                    url = 'http://data.cnki.net' + s.select('td:nth-of-type(3) > a')[1].get('href')
                    # The table's serial code, taken from the link
                    code = s.select('td:nth-of-type(3) > a')[1].get('href').split("=")[1]
                    # Without cleaning, the file name would contain \n etc. and break the download
                    title = dataclear(title)
                    # Skip files that already exist
                    if not os.path.isfile(dictionaryName + '/' + title + '.xls'):
                        # Skip appendices ('附录'): some carry secondary URLs that raise errors
                        if '附录' not in title:
                            # Wait a random while
                            time.sleep(random.random() * 4 + 8)
                            print(filedown(title, url, code))
                    else:
                        print('Already exists: ' + title)
            except Exception as e:
                print('error:-------------------' + str(e))
                sys.exit(0)
    os.chdir(os.path.abspath(os.path.dirname(os.getcwd())))
# Bump and print the global download counter
def count():
    global count1
    count1 = count1 + 1
    print('===== Downloaded: ' + str(count1) + ' files')
# Download a single table file
def filedown(title, url, code):
    # First check whether a file with this code already exists
    path = os.getcwd()
    for file in os.listdir(path):
        if code in file:
            print("File already exists")
            olddir = os.path.join(path, file)
            newdir = os.path.join(path, title + '.xls')
            print(olddir)
            print(newdir)
            os.rename(olddir, newdir)
            print('Renamed: ' + title)
            count()
            return "Done"
    global browser
    # Use selenium to simulate the click directly
    options = webdriver.ChromeOptions()
    # popups=0 blocks pop-up windows; also set the default download directory
    prefs = {'profile.default_content_settings.popups': 0,
             'download.default_directory': os.getcwd()}
    options.add_experimental_option('prefs', prefs)
    # Headless mode: no visible browser window
    options.add_argument('headless')
    desired_capabilities = DesiredCapabilities.CHROME  # change the page-load strategy
    desired_capabilities["pageLoadStrategy"] = "none"
    # Create the browser object
    browser = webdriver.Chrome(desired_capabilities=desired_capabilities, options=options)
    wait = WebDriverWait(browser, 10)
    try:
        # Open the download URL
        browser.get(url)
        # Find and click the login button
        wait.until(EC.presence_of_element_located((By.ID, 'Button2')))
        loginButton = browser.find_element_by_id('Button2')
        loginButton.click()
        countdown = 15
        # Poll the download folder until a file containing the code shows up
        while code not in "".join(os.listdir(path)):
            time.sleep(1)
            countdown = countdown - 1
            print("===== Countdown: " + str(countdown))
            if countdown < 1:
                browser.quit()
                return "Download failed: timeout"
        else:
            # while-else: runs once the condition turns false, i.e. the file arrived
            # Rename the downloaded file
            for file in os.listdir(path):
                if code in file:
                    olddir = os.path.join(path, file)
                    newdir = os.path.join(path, title + '.xls')
                    print(olddir)
                    print(newdir)
                    if not os.path.isfile(newdir):
                        os.rename(olddir, newdir)
                        browser.quit()
                        count()
                        return 'Download complete, renamed: ' + title
                    else:
                        browser.quit()
                        return 'File already exists'
        print('Should never get here')
        browser.quit()
        sys.exit(0)
    except Exception as e:
        print(e)
        browser.quit()
        sys.exit(0)
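# The polling loop in filedown above relies on Python's while-else: the else
# branch runs only when the while condition itself becomes false (the file
# appeared), not when the loop is left via return. A minimal illustration:
#
#     n = 3
#     while n > 0:
#         n -= 1
#     else:
#         print('condition became false')  # always printed: there was no break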
def spider():
    # The yearbook to crawl
    yearBooksName = '中国能源统计年鉴'
    # Folder name derived from the yearbook name
    dictionaryName = os.getcwd() + '/' + yearBooksName
    # Create the folder if it does not exist yet
    if not os.path.isdir(dictionaryName):
        os.mkdir(dictionaryName)
    os.chdir(dictionaryName)
    # Yearbooks to crawl; each year's ybcode comes from inspecting the site's HTML
    yearBooks = [
        # {'ybcode': 'N2018070147', 'year': '2017年'},
        # {'ybcode': 'N2017110016', 'year': '2016年'},
        # {'ybcode': 'N2016120537', 'year': '2015年'},
        # {'ybcode': 'N2015110114', 'year': '2014年'},
        {'ybcode': 'N2014030143', 'year': '2013年'},
        {'ybcode': 'N2013020081', 'year': '2012年'},
        # {'ybcode': 'N2012020066', 'year': '2011年'},
        # {'ybcode': 'N2011030123', 'year': '2010年'},
        # {'ybcode': 'N2010080088', 'year': '2009年'},
        # {'ybcode': 'N2009060138', 'year': '2008年'},
        # {'ybcode': 'N2008070077', 'year': '2007年'},
        # {'ybcode': 'N2009100078', 'year': '2006年'},
        # {'ybcode': 'N2009100028', 'year': '2005年'},
        # {'ybcode': 'N2006050898', 'year': '2004年'},
        # {'ybcode': 'N2006050897', 'year': '2000-2002年'},
        # {'ybcode': 'N2005120868', 'year': '1997-1999年'},
        # {'ybcode': 'N2010040156', 'year': '1991年'},
        {'ybcode': 'N2005120869', 'year': '1991-1996年'},
        {'ybcode': 'N2005120761', 'year': '1989年'},
        {'ybcode': 'N2006010708', 'year': '1986年'},
    ]
    # Crawl each yearbook in turn
    for yearBook in yearBooks:
        filedata(yearBook, yearBooksName)

if __name__ == '__main__':
    spider()
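# Note: the crawler deliberately exits via sys.exit(0) on any error instead of
# retrying in-process; the keeprunning.py watchdog below detects the exit and
# restarts it.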
Keep-alive runner: monitors the crawler process and automatically restarts it whenever it stops. It invokes cmd directly, runs the crawler via its absolute path, and captures its log output.
# -*- coding: UTF-8 -*-
#!DATE: 2018/10/9
#!@Author: yingying
# keeprunning.py
import os
import subprocess

# logging
# requires python 2.6.6 or later
import logging
from logging.handlers import RotatingFileHandler

## log settings: SHOULD BE CONFIGURED BY config
LOG_PATH_FILE = r"D:\pyCharm\studyTool\my_service_mgr.log"
LOG_MODE = 'a'
LOG_MAX_SIZE = 10 * 1024 * 1024  # 10M per file
LOG_MAX_FILES = 10  # 10 files: my_service_mgr.log.1, my_service_mgr.log.2, ...
LOG_LEVEL = logging.DEBUG
LOG_FORMAT = "%(asctime)s %(levelname)-10s[%(filename)s:%(lineno)d(%(funcName)s)] %(message)s"
handler = RotatingFileHandler(LOG_PATH_FILE, LOG_MODE, LOG_MAX_SIZE, LOG_MAX_FILES)
formatter = logging.Formatter(LOG_FORMAT)
handler.setFormatter(formatter)
Logger = logging.getLogger()
Logger.setLevel(LOG_LEVEL)
Logger.addHandler(handler)
# colored console output
pid = os.getpid()

def print_error(s):
    print('\033[31m[%d: ERROR] %s\033[0m' % (pid, s))

def print_info(s):
    print('\033[32m[%d: INFO] %s\033[0m' % (pid, s))

def print_warning(s):
    print('\033[33m[%d: WARNING] %s\033[0m' % (pid, s))
def start_child_proc(command, merged):
    try:
        if command is None:
            raise OSError("Invalid command")
        child = None
        if merged is True:
            # merge stdout and stderr into a single pipe
            child = subprocess.Popen(command,
                                     stderr=subprocess.STDOUT,  # send the child's stderr to its stdout
                                     stdout=subprocess.PIPE)    # create a new pipe for stdout
        else:
            # do NOT merge stdout and stderr; give each its own pipe
            child = subprocess.Popen(command,
                                     stderr=subprocess.PIPE,
                                     stdout=subprocess.PIPE)
        return child
    except subprocess.CalledProcessError:
        pass  # handle errors in the called executable
    except OSError:
        raise OSError("Failed to run command!")
def run_forever(command):
    print_info("start child process with command: " + ' '.join(command))
    Logger.info("start child process with command: " + ' '.join(command))
    merged = False
    child = start_child_proc(command, merged)
    failover = 0
    while True:
        # restart the child whenever it has already exited
        while child.poll() is not None:
            failover = failover + 1
            print_warning("child process shutdown with return code: " + str(child.returncode))
            Logger.critical("child process shutdown with return code: " + str(child.returncode))
            print_warning("restart child process again, times=%d" % failover)
            Logger.info("restart child process again, times=%d" % failover)
            child = start_child_proc(command, merged)
        # block until the child exits, then read its stdout/stderr and log them
        out, err = child.communicate()
        returncode = child.returncode
        if returncode != 0:
            for errorline in err.decode('utf-8', errors='ignore').splitlines():
                Logger.info(errorline)
        else:
            Logger.info("child process exited normally")
    Logger.exception("!!!should never run to this!!!")  # unreachable
if __name__ == "__main__":
    # pass the command as an argument list so both Popen and ' '.join work
    cmd = ['py', r'D:\pyCharm\studyTool\cnkiCrawler.py']
    run_forever(cmd)
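As a sketch of a more portable __main__ block (an assumption, not part of the original: it presumes keeprunning.py sits in the same folder as cnkiCrawler.py), the hard-coded 'py' launcher and absolute path can be replaced by the running interpreter and a path resolved relative to this file:

if __name__ == "__main__":
    import sys
    # hypothetical alternative: use the current interpreter and a relative path
    cmd = [sys.executable,
           os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cnkiCrawler.py')]
    run_forever(cmd)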