Python 爬取知网(CNKI)统计年鉴

爬虫主方法

# -*- coding: utf-8 -*-
import os
import random
import re
import sys
import threading
import time
import urllib.parse
import urllib.request

import chardet
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

count1 = 0
lock = threading.Lock()


# POST the catalog query and return one page of yearbook catalog HTML.
def get_result(ybcode, page=1):
    """Fetch one page of the catalog listing for a yearbook.

    ybcode: CNKI yearbook code, e.g. 'N2014030143'.
    page:   1-based catalog page number.
    Returns the response body decoded as UTF-8 (an HTML fragment).
    On any failure, logs the error and exits the process so an external
    watchdog can restart the crawler.
    """
    try:
        # Form body; the server pages the catalog 20 rows at a time.
        data = {'ybcode': ybcode, 'entrycode': '', 'page': page, 'pagerow': '20',
                'Referer': 'http://data.cnki.net/Yearbook'}
        # Request headers.
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
            # The Cookie may need refreshing from time to time.
            'Cookie': 'Ecp_ClientId=2201106155502682665; Ecp_LoginStuts={"IsAutoLogin":false,"UserName":"sh0292",'
                      '"ShowName":"%e4%b8%ad%e5%9b%bd%e7%9f%bf%e4%b8%9a%e5%a4%a7%e5%ad%a6%e5%9b%be%e4%b9%a6%e9%a6%86",'
                      '"UserType":"bk","BUserName":"","BShowName":"","BUserType":"","r":"R7eKrF"}; '
                      'c_m_LinID=LinID=WEEvREcwSlJHSldSdmVqeVpQWEhjK2JqNWVTcFpPTFJSTFVLMnUxWGMyQT0'
                      '=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!&ot=11/06/2020 17:39:14; '
                      'LID=WEEvREcwSlJHSldSdmVqeVpQWEhjK2JqNWVTcFpPTFJSTFVLMnUxWGMyQT0'
                      '=$9A4hF_YAuvQ5obgVAqNKPCYcEjKensW4IQMovwHtwkF4VYPoHbKxJw!!; c_m_expire=2020-11-06 17:39:14; '
                      'Hm_lvt_911066eb2f53848f7d902db7bb8ac4d7=1604650989,1604651116,1604651168,1604654428; '
                      'ASP.NET_SessionId=3d0xpwff2pt0exxcclmw3we4; SID=009023; '
                      'Hm_lpvt_911066eb2f53848f7d902db7bb8ac4d7=1604654428'
            ,
            'Referer': 'https://login.cnki.net/login/?platform=kns&ForceReLogin=1&ReturnURL=https://www.cnki.net/',
        }
        # Catalog query endpoint.
        url = "https://data.cnki.net/Yearbook/PartialGetCatalogResult"
        # URL-encode the form data.
        params = urllib.parse.urlencode(data).encode(encoding='utf-8')
        # Build and send the request.
        req = urllib.request.Request(url, params, headers)
        r = urllib.request.urlopen(req)
        # Decode the response.
        res = str(r.read(), 'utf-8')
        return res
    except Exception as e:
        # Print the cause before exiting (the original exited silently,
        # making failures impossible to diagnose), then exit so the
        # watchdog restarts the crawler.
        print('get_result failed: ' + str(e))
        sys.exit(0)


# Query the first catalog page and read the total page count from it.
def get_pageno(ybcode):
    """Return the total number of catalog pages for *ybcode*."""
    markup = BeautifulSoup(get_result(ybcode), 'lxml')
    pager_text = markup.select('.s_p_listl')[0].get_text()
    # The pager text contains "共N页"; pick out N.
    pages = int(pager_text.split("共")[2].split('页')[0])
    print('总共' + str(pages) + '页')
    return pages


# Text cleanup: drop every \n, \r and space, and turn '>' runs into '-'.
def dataclear(data):
    """Return *data* stripped of newlines/CRs/spaces, with runs of '>'
    collapsed to a single '-'."""
    data = re.sub(r'[\n\r ]+', '', data)
    return re.sub(r'>+', '-', data)


# Download every Excel table listed in the catalog of one yearbook.
def filedata(yearBook, yearBookName):
    """Crawl all catalog pages of one yearbook and download each table.

    yearBook:     dict with keys 'ybcode' (CNKI code) and 'year' (label).
    yearBookName: human-readable yearbook name, used in the folder name.

    Side effects: creates a per-year directory, chdir()s into it for the
    downloads, then chdir()s back up one level at the end. Exits the
    process on unexpected errors so an external watchdog can restart it.
    """
    ybcode = yearBook.get('ybcode')
    dictionaryName = os.getcwd() + '/' + yearBook.get('year') + yearBookName
    dictionaryName = dataclear(dictionaryName)
    pageno = get_pageno(ybcode)
    print(os.getcwd())
    # Create the per-year directory if it does not exist yet.
    if os.path.isdir(dictionaryName) == 0:
        os.mkdir(dictionaryName)
    os.chdir(dictionaryName)
    for i in range(1, pageno + 1, 1):
        print('######当前第' + str(i) + '页######')
        # Parse the catalog HTML fragment for this page.
        soup = BeautifulSoup(get_result(ybcode, i), 'lxml')
        # print(soup)
        for j in soup.select('tr'):
            s = BeautifulSoup(str(j), 'lxml')
            # print(s)
            # Rows without the download icon carry no downloadable file.
            if len(s.select('img[src="/resources/design/images/nS_down2.png"]')) == 0:
                pass
            else:
                try:
                    # The third <td> holds the download links.
                    if len(BeautifulSoup(str(j), 'lxml').select('td:nth-of-type(3) > a')) >= 1:
                        # Table title from the first <td>.
                        title = str(BeautifulSoup(str(j), 'lxml').select('td:nth-of-type(1) > a')[0].get_text())
                        # Absolute download URL (second link in the cell —
                        # presumably the Excel one; the first looks like CAJ).
                        url = 'http://data.cnki.net' + BeautifulSoup(str(j), 'lxml').select('td:nth-of-type(3) > a')[
                            1].get('href')
                        # Unique table code, taken from the href query string.
                        code = BeautifulSoup(str(j), 'lxml').select('td:nth-of-type(3) > a')[
                            1].get('href').split("=")[1]
                        # Clean the title: otherwise \n etc. would end up in
                        # the file name and break the download/rename.
                        title = dataclear(title)
                        # Skip tables whose target file already exists.
                        if not os.path.isfile(dictionaryName + '/' + title + '.xls'):
                            # Skip appendices ("附录"): some carry secondary
                            # URLs that make the download fail.
                            if '附录' not in title:
                                # Random pause (8-12 s) to avoid hammering
                                # the server.
                                time.sleep(random.random() * 4 + 8)
                                print(filedown(title, url, code))
                        else:
                            print('已存在:' + title)
                except Exception as e:
                    print('error:-------------------' + str(e))
                    # Exit so the external watchdog restarts the crawler.
                    sys.exit(0)
    # Return to the parent directory for the next yearbook.
    os.chdir(os.path.abspath(os.path.dirname(os.getcwd())))


def count():
    """Increment the global download counter and print progress.

    The module defines a threading.Lock (`lock`) that the original never
    used; take it here so concurrent callers cannot lose increments.
    """
    global count1
    with lock:
        count1 = count1 + 1
    print('=====已下载:' + str(count1) + '个')


# File download worker: fetches one Excel table via a headless browser.
def filedown(title, url, code):
    """Download one table file and rename it to '<title>.xls'.

    title: cleaned table title, used as the target file name.
    url:   absolute download URL on data.cnki.net.
    code:  unique table code; the downloaded file's name contains it,
           which is how a finished download is detected in the cwd.

    Returns a short status string; exits the process on unexpected errors
    so an external watchdog can restart the crawler.
    """
    # If a file containing this code already exists in the cwd, skip the
    # download and just rename it.
    path = os.getcwd()
    for file in os.listdir(path):
        if code in file:
            print("文件已存在")
            olddir = os.path.join(path, file)
            newdir = os.path.join(path, title + '.xls')
            print(olddir)
            print(newdir)
            os.rename(olddir, newdir)
            print('重命名:' + title)
            count()
            return "已完成"

    global browser
    # Drive the download by simulating clicks with Selenium.
    options = webdriver.ChromeOptions()
    # 0 disables popup windows; also point Chrome's default download
    # directory at the current working directory.
    prefs = {'profile.default_content_settings.popups': 0,
             'download.default_directory': os.getcwd()}
    options.add_experimental_option('prefs', prefs)
    # Headless mode: do not show a browser window.
    options.add_argument('headless')
    # NOTE(review): DesiredCapabilities and the desired_capabilities kwarg
    # were removed in Selenium 4 — this code requires Selenium 3.x to run.
    desired_capabilities = DesiredCapabilities.CHROME  # change page-load strategy
    desired_capabilities["pageLoadStrategy"] = "none"
    # Create the browser instance.
    browser = webdriver.Chrome(desired_capabilities=desired_capabilities, options=options)
    wait = WebDriverWait(browser, 10)
    try:
        # Open the download page.
        browser.get(url)
        # Wait for the login/download button to appear.
        wait.until(EC.presence_of_element_located((By.ID, 'Button2')))
        # NOTE(review): find_element_by_id was also removed in Selenium 4
        # (use find_element(By.ID, ...) there).
        loginButton = browser.find_element_by_id('Button2')
        # Click it to start the download.
        loginButton.click()
        countdown = 15
        # Poll the cwd until a file containing `code` appears, up to ~15 s.
        while code not in "".join(os.listdir(path)):
            time.sleep(1)
            countdown = countdown - 1
            print("=====倒计时:" + str(countdown))
            if countdown < 1:
                browser.quit()
                return "下载失败:超时"
        else:
            # Rename the downloaded file to the human-readable title.
            for file in os.listdir(path):
                if code in file:
                    olddir = os.path.join(path, file)
                    newdir = os.path.join(path, title + '.xls')
                    print(olddir)
                    print(newdir)
                    if not os.path.isfile(newdir):
                        os.rename(olddir, newdir)
                        browser.quit()
                        count()
                        return '下载完成,重命名:' + title
                    else:
                        browser.quit()
                        return '文件已存在'
            # Unreachable in theory: the while-loop only exits once a file
            # containing `code` exists.
            print('不应该运行到这里')
            browser.quit()
            sys.exit(0)
    except Exception as e:
        print(e)
        browser.quit()
        # Exit so the external watchdog restarts the crawler.
        sys.exit(0)


def spider():
    """Entry point: prepare the output folder and crawl each yearbook."""
    # Yearbook series to crawl.
    yearBooksName = '中国能源统计年鉴'
    # Output folder named after the series, created on demand.
    dictionaryName = os.getcwd() + '/' + yearBooksName
    if not os.path.isdir(dictionaryName):
        os.mkdir(dictionaryName)
    os.chdir(dictionaryName)
    # (ybcode, year) pairs copied from the site's page source; the
    # commented-out entries are currently excluded from the crawl.
    yearBooks = [

        # {'ybcode': 'N2018070147', 'year': '2017年'},
        # {'ybcode': 'N2017110016', 'year': '2016年'},
        # {'ybcode': 'N2016120537', 'year': '2015年'},
        # {'ybcode': 'N2015110114', 'year': '2014年'},
        {'ybcode': 'N2014030143', 'year': '2013年'},
        {'ybcode': 'N2013020081', 'year': '2012年'},
        # {'ybcode': 'N2012020066', 'year': '2011年'},
        # {'ybcode': 'N2011030123', 'year': '2010年'},
        # {'ybcode': 'N2010080088', 'year': '2009年'},
        # {'ybcode': 'N2009060138', 'year': '2008年'},
        # {'ybcode': 'N2008070077', 'year': '2007年'},
        # {'ybcode': 'N2009100078', 'year': '2006年'},
        # {'ybcode': 'N2009100028', 'year': '2005年'},
        # {'ybcode': 'N2006050898', 'year': '2004年'},
        # {'ybcode': 'N2006050897', 'year': '2000-2002年'},
        # {'ybcode': 'N2005120868', 'year': '1997-1999年'},
        # {'ybcode': 'N2010040156', 'year': '1991年'},
        {'ybcode': 'N2005120869', 'year': '1991-1996年'},
        {'ybcode': 'N2005120761', 'year': '1989年'},
        {'ybcode': 'N2006010708', 'year': '1986年'},
    ]
    # Crawl every selected yearbook in turn.
    for yearBook in yearBooks:
        filedata(yearBook, yearBooksName)


# Script entry point: run the crawler once; the watchdog script restarts
# this process whenever it exits.
if __name__ == '__main__':
    spider()

持久化运行,监测爬虫程序运行状态,如果停止了自动重新开始,直接调用cmd,用绝对路径运行爬虫,并将日志输出获取。

# -*- coding: UTF-8 -*-
#!DATE: 2018/10/9
#!@Author: yingying
#keeprunning.py
import os
import subprocess

# logging
# require python2.6.6 and later
import logging
from logging.handlers import RotatingFileHandler

## Log settings: SHOULD BE CONFIGURED BY config.
# Raw string keeps the Windows backslashes literal and avoids the
# invalid-escape-sequence warnings the plain string produced.
LOG_PATH_FILE = r"D:\pyCharm\studyTool\my_service_mgr.log"
LOG_MODE = 'a'
LOG_MAX_SIZE = 10 * 1024 * 1024  # 10M per file
LOG_MAX_FILES = 10  # rotate up to 10 backups: my_service_mgr.log.1, .2, ...
LOG_LEVEL = logging.DEBUG

LOG_FORMAT = "%(asctime)s %(levelname)-10s[%(filename)s:%(lineno)d(%(funcName)s)] %(message)s"

# Rotating file handler so the log cannot grow without bound.
handler = RotatingFileHandler(LOG_PATH_FILE, LOG_MODE, LOG_MAX_SIZE, LOG_MAX_FILES)
formatter = logging.Formatter(LOG_FORMAT)
handler.setFormatter(formatter)

# Attach the handler to the root logger.
Logger = logging.getLogger()
Logger.setLevel(LOG_LEVEL)
Logger.addHandler(handler)

# PID used to tag the colored console output helpers below.
pid = os.getpid()


def print_error(s):
    """Print *s* in red to stdout, tagged with this process's pid."""
    # \033[0m resets terminal attributes; the original ended with
    # \033[31;m, which left the terminal colored red.
    print('\033[31m[%d: ERROR] %s\033[0m' % (pid, s))


def print_info(s):
    """Print *s* in green to stdout, tagged with this process's pid."""
    # \033[0m resets terminal attributes; the original ended with
    # \033[32;m, which left the terminal colored green.
    print('\033[32m[%d: INFO] %s\033[0m' % (pid, s))


def print_warning(s):
    """Print *s* in yellow to stdout, tagged with this process's pid."""
    # \033[0m resets terminal attributes; the original ended with
    # \033[33;m, which left the terminal colored yellow.
    print('\033[33m[%d: WARNING] %s\033[0m' % (pid, s))


def start_child_proc(command, merged):
    """Spawn *command* as a child process and return the Popen object.

    command: program + arguments (list) or a command-line string.
    merged:  caller's intent to merge stderr into stdout; both branches
             currently launch the child unpiped (the piped variants are
             kept below, commented out).

    Raises OSError if command is None or the process cannot be started.
    (The original wrote `raise (OSError, "msg")`, which raises TypeError
    in Python 3 because a tuple is not an exception.)
    """
    if command is None:
        raise OSError("Invalid command")
    try:
        if merged:
            # merge stdout and stderr
            child = subprocess.Popen(command)
            # child = subprocess.Popen(command,
            #                          stderr=subprocess.STDOUT,  # send child's stderr to its stdout
            #                          stdout=subprocess.PIPE  # create a new pipe for stdout
            #                          )
        else:
            # DO NOT merge stdout and stderr
            child = subprocess.Popen(command)
            # child = subprocess.Popen(command,
            #                          stderr=subprocess.PIPE,
            #                          stdout=subprocess.PIPE)
        return child
    except subprocess.CalledProcessError:
        pass  # handle errors in the called executable
    except OSError as e:
        # Chain the original cause for diagnosis.
        raise OSError("Failed to run command!") from e


def run_forever(command):
    """Run *command* as a child process and restart it whenever it exits.

    command: program + arguments (list, preferred) or a command-line
             string. Never returns under normal operation.
    """
    # ' '.join(command) on a *string* would join its characters one by
    # one; only join when given a sequence.
    cmd_text = command if isinstance(command, str) else ' '.join(command)
    print_info("start child process with command: " + cmd_text)
    Logger.info("start child process with command: " + cmd_text)

    merged = False
    child = start_child_proc(command, merged)

    failover = 0

    while True:
        # poll() returns None while the child is alive; any other value
        # means it has exited and must be restarted.
        while child.poll() is not None:
            failover = failover + 1
            print_warning("child process shutdown with return code: " + str(child.returncode))
            Logger.critical("child process shutdown with return code: " + str(child.returncode))

            print_warning("restart child process again, times=%d" % failover)
            Logger.info("restart child process again, times=%d" % failover)
            child = start_child_proc(command, merged)

        # Block until the child exits. start_child_proc does not pipe the
        # child's streams, so err may be None — the original crashed here
        # (err.slitlines() was also a typo for splitlines()).
        out, err = child.communicate()
        if child.returncode != 0:
            if err:
                for errorline in err.splitlines():
                    Logger.info(errorline)
            Logger.info("child process exited with code %d" % child.returncode)

    Logger.exception("!!!should never run to this!!!")


if __name__ == "__main__":
    cmd = 'py D:\pyCharm\studyTool\cnkiCrawler.py'
    run_forever(cmd)

你可能感兴趣的:(Python 爬取zw年鉴)