Python web scraping in practice: scraping futures position data from http://cffex.com.cn/ and storing it in MySQL

The task is to scrape the position-ranking data for IF, IC, IH, TS, TF and T from http://cffex.com.cn/ccpm/?productid=IF, for dates in [2016.1.1, 2020.1.1).
First, check robots.txt: the site has no robots.txt file.
Looking at the page source, the date/product query is not rendered as static HTML; the data for each product and trading day is fetched as a separate XML file, at URLs of the form http://cffex.com.cn/sj/ccpm/{yyyymm}/{dd}/{product}.xml (see getURLandDate in the code below).

After several rounds of rewriting the code, the main problems were:
1. small logic bugs in the date handling;
2. the time per page kept growing; halfway through, a single page took about 9 s from fetch to database insert;
3. tag names in the XML files have inconsistent casing, and tag contents contain stray whitespace.

Problem 1 was fixed over a few iterations. Problem 2 was mainly the duplicate check: once the table holds many rows, checking for duplicates before each insert becomes very slow, and sometimes the query takes so long that the connection drops. For problem 3, the HTML parser is case-insensitive (tag names are normalized to lowercase), and values are stripped of whitespace before being inserted.
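As a quick illustration of the fix for problem 3 (the record below is made up, but the field names match what the loading script looks for), BeautifulSoup's lxml parser lowercases tag names, so find('instrumentid') also matches <INSTRUMENTID>, and strip() removes the stray whitespace:

from bs4 import BeautifulSoup

# A made-up record; the real files contain many <data> items with these fields.
sample = '''
<DATA>
  <INSTRUMENTID> IF1912 </INSTRUMENTID>
  <TRADINGDAY>20191231</TRADINGDAY>
  <DATATYPEID>1</DATATYPEID>
  <RANK>1</RANK>
  <SHORTNAME> 某期货 </SHORTNAME>
  <VOLUME>1000</VOLUME>
  <VARVOLUME>-20</VARVOLUME>
  <PARTYID>123</PARTYID>
</DATA>
'''

bsObj = BeautifulSoup(sample, 'lxml')     # the lxml HTML parser lowercases tag names
item = bsObj.find('data')                 # matches <DATA> despite the different case
print(item.find('instrumentid').get_text().strip())  # 'IF1912' after stripping spaces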

To work around problem 2, I first tried the timeout-and-retry approach from https://www.cnblogs.com/gl1573/p/10129382.html, but in the end decided to save the XML files to local disk first and load them into the database in a second pass: this puts less load on the site and is much less likely to be interrupted. Also, because the duplicate check was so slow and the XML data itself should not contain duplicate records, the loading script was changed to insert rows directly, without checking.
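If duplicate protection were still wanted, one alternative (not used here, just a sketch) would be to let MySQL enforce it instead of running a SELECT before every insert: add a unique key over the same columns the disabled duplicate-check query used, and insert with INSERT IGNORE so duplicates are rejected by the index:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='aboutime',
                       db='futures', charset='utf8')
cur = conn.cursor()

# One-time setup: unique key over the columns that identify a ranking row
# (the same four columns the old duplicate-check query filtered on).
cur.execute('ALTER TABLE cffex_t ADD UNIQUE KEY uk_row '
            '(instrumentid, tradingday, datatypeid, paiming);')

# INSERT IGNORE silently skips rows that would violate the unique key.
# The values below are a made-up example row.
cur.execute(
    'INSERT IGNORE INTO cffex_t (instrumentid, tradingday, datatypeid, paiming,'
    ' shortname, volume, varvolume, partyid) VALUES (%s, %s, %s, %s, %s, %s, %s, %s);',
    ('IF1912', '2019-12-31', 1, 1, '某期货', 1000, -20, 123))
conn.commit()
cur.close()
conn.close()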

Scraper code:

import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
import time
import random


def leapYear(year):
    # Return True if year is a leap year.
    if (year % 400 == 0) or (year % 4 == 0 and year % 100 != 0):
        return True
    return False


def nextDate(date):
    # Return the calendar date after date. date is an int list [year, month, day, weekday], e.g. [2020, 9, 16, 3].
    y, m, d, w = date
    w = (w + 1) % 7
    if w == 0:
        w = 7
    d += 1
    if d == 32:
        d = 1
        m += 1
        if m == 13:
            m = 1
            y += 1
    elif d == 31:
        if m in {4, 6, 9, 11}:
            d = 1
            m += 1
    elif d == 30 and m == 2:
        d = 1
        m = 3
    elif d == 29 and m == 2:
        if not leapYear(y):
            d = 1
            m = 3
    return [y, m, d, w]
    # 1,3,5,7,8,10,12   31
    # 4,6,9,11          30
    # 2                 28/29


def checkDate(date):
    # If the date has no weekday, derive it by walking forward from a reference date (1970-01-01 was a Thursday).
    if len(date) == 3:
        standard = [1970, 1, 1, 4]
        y, m, d, w = standard
        yd, md, dd = date
        while y != yd or m != md or d != dd:
            standard = nextDate(standard)
            y, m, d, w = standard
        return standard
    return date


def getInput():
    # 接受两个表示日期的列表,返回开始日期,结束日期,日期之差(需要爬取的天数),需要的种类
    print('请输入开始日期和结束日期,顺序随便。爬取数据包括开始日期但不包括结束日期。\n日期的格式为 年.月.日')
    s = input('开始日期:')
    e = input('结束日期:')
    ks = input('请输入需要的种类,如IF.IC.IH.TS.TF.T,大小写随意。如果需要多个,用.隔开:').upper()
    s = s.split('.')
    e = e.split('.')
    kinds = ks.split('.')
    s = checkDate([int(s[0]), int(s[1]), int(s[2])])
    e = checkDate([int(e[0]), int(e[1]), int(e[2])])
    if s[0] > e[0]:
        s, e = e, s
    elif (s[0] == e[0]) and (s[1] > e[1]):
        s, e = e, s
    elif (s[0] == e[0]) and (s[1] == e[1]) and (s[2] > e[2]):
        s, e = e, s
    start_date = s
    end_date = e
    total_days = 1
    while (s[0] != e[0]) or (s[1] != e[1]) or (s[2] != e[2]):
        s = nextDate(s)
        total_days += 1
    return [start_date, end_date, total_days, kinds]


def getURLandDate(start_date, kind):
    # Build the URL and a date string (date + kind, used for the file name) from start_date and kind.
    y, m, d, w = start_date
    url = "http://cffex.com.cn/sj/ccpm/{}{:02}/{:02}/{}.xml?id={}".format(
        y, m, d, kind, random.randint(1, 99))
    date = '{}{:02}{:02}{}'.format(y, m, d, kind)
    return [url, date]


if __name__ == '__main__':
    try:
        start_time = time.time()
        start_date, end_date, total_days, kinds = getInput()
        # Each page takes about 1.65 s in total (~0.65 s to fetch plus a 1 s pause);
        # roughly 5/7 of the days are trading days.
        pages = total_days * len(kinds) * 5 / 7
        print('About {:.0f} pages, estimated {:.0f} s, roughly {:.0f} min / {:.1f} h...'.format(
            pages, pages * 1.65, pages * 1.65 / 60, pages * 1.65 / 3600))
        del pages
        if end_date[3] == 6:
            end_date = nextDate(end_date)
        if end_date[3] == 7:
            end_date = nextDate(end_date)
        s_d = start_date
        e_d = end_date
        p = 0
        # Retry settings: counting the initial attempt, up to 4 connection attempts on failure.
        session = requests.Session()
        session.mount('http://', HTTPAdapter(max_retries=3))
        session.mount('https://', HTTPAdapter(max_retries=3))
        for kind in kinds:
            start_date = s_d
            end_date = e_d
            while start_date != end_date:
                if start_date[3] == 6 or start_date[3] == 7:
                    start_date = nextDate(start_date)
                    p += 1
                    continue
                url, date = getURLandDate(start_date, kind)
                xml = session.get(url, timeout=30)
                # Set the encoding before .text is used, so both the error check
                # and the saved file are decoded correctly.
                xml.encoding = xml.apparent_encoding
                bsObj = BeautifulSoup(xml.text, 'lxml')
                # The site returns a "网页错误" (page error) page for days with no data.
                if bsObj.find('title', text='网页错误'):
                    start_date = nextDate(start_date)
                    p += 1
                    continue
                with open('D:\\cffex_spider\\' + date + ".txt", "w") as f:
                    f.write(xml.text)
                print('\rElapsed: {:.0f} min, progress: about {:.2%}, processing: {}'.format(
                    (time.time() - start_time) / 60,
                    float(p) / total_days / len(kinds), url),
                      end='')
                start_date = nextDate(start_date)
                time.sleep(1)
                p += 1
    # except Exception:
    #     print('Something went wrong...')
    finally:
        end_time = time.time()
        t = end_time - start_time
        print('Done. Total time: %d s, about %d min / %f h' % (t, t / 60, t / 3600))

Code for loading the files into the database:

from bs4 import BeautifulSoup
import time
import pymysql
import logging


def leapYear(year):
    # Return True if year is a leap year.
    if (year % 400 == 0) or (year % 4 == 0 and year % 100 != 0):
        return True
    return False


def nextDate(date):
    # Return the calendar date after date. date is an int list [year, month, day, weekday], e.g. [2020, 9, 16, 3].
    y, m, d, w = date
    w = (w + 1) % 7
    if w == 0:
        w = 7
    d += 1
    if d == 32:
        d = 1
        m += 1
        if m == 13:
            m = 1
            y += 1
    elif d == 31:
        if m in {4, 6, 9, 11}:
            d = 1
            m += 1
    elif d == 30 and m == 2:
        d = 1
        m = 3
    elif d == 29 and m == 2:
        if not leapYear(y):
            d = 1
            m = 3
    return [y, m, d, w]
    # 1,3,5,7,8,10,12   31
    # 4,6,9,11          30
    # 2                 28/29


def checkDate(date):
    # If the date has no weekday, derive it by walking forward from a reference date (1970-01-01 was a Thursday).
    if len(date) == 3:
        standard = [1970, 1, 1, 4]
        y, m, d, w = standard
        yd, md, dd = date
        while y != yd or m != md or d != dd:
            standard = nextDate(standard)
            y, m, d, w = standard
        return standard
    return date


def getInput():
    # Read two dates and the product list; return start date, end date, the number of days to scrape, and the products.
    print('Enter the start date and end date (in either order). The start date is included, the end date is not.\nDate format: year.month.day')
    s = input('Start date: ')
    e = input('End date: ')
    ks = input('Enter the products, e.g. IF.IC.IH.TS.TF.T (case-insensitive). Separate multiple products with ".": ').upper()
    s = s.split('.')
    e = e.split('.')
    kinds = ks.split('.')
    s = checkDate([int(s[0]), int(s[1]), int(s[2])])
    e = checkDate([int(e[0]), int(e[1]), int(e[2])])
    if s[0] > e[0]:
        s, e = e, s
    elif (s[0] == e[0]) and (s[1] > e[1]):
        s, e = e, s
    elif (s[0] == e[0]) and (s[1] == e[1]) and (s[2] > e[2]):
        s, e = e, s
    start_date = s
    end_date = e
    total_days = 1
    while (s[0] != e[0]) or (s[1] != e[1]) or (s[2] != e[2]):
        s = nextDate(s)
        total_days += 1
    return [start_date, end_date, total_days, kinds]


if __name__ == '__main__':
    try:
        start_time = time.time()
        logging.basicConfig(level=logging.INFO, filename='mylog.txt')
        conn = pymysql.connect(host='127.0.0.1',
                               user='root',
                               passwd='aboutime',
                               db='mysql',
                               charset='utf8')
        cur = conn.cursor()
        cur.execute('USE futures;')
        start_date, end_date, total_days, kinds = getInput()
        # Loading one page into the database takes roughly 1 s.
        pages = total_days * len(kinds) * 5 / 7
        print('About {:.0f} pages, estimated {:.0f} s, roughly {:.0f} min / {:.1f} h...'.format(
            pages, pages, pages / 60, pages / 3600))
        del pages
        if end_date[3] == 6:
            end_date = nextDate(end_date)
        if end_date[3] == 7:
            end_date = nextDate(end_date)
        s_d = start_date
        e_d = end_date
        p = 0
        for kind in kinds:
            start_date = s_d
            end_date = e_d
            while start_date != end_date:
                if start_date[3] == 6 or start_date[3] == 7:
                    start_date = nextDate(start_date)
                    p += 1
                    continue
                y, m, d, w = start_date
                date = '{}{:02}{:02}{}'.format(y, m, d, kind)
                with open('D:\\cffex_spider\\' + date + ".txt", "rb") as f:
                    xml = f.read()
                bsObj = BeautifulSoup(xml, 'lxml')
                if bsObj.find('title', text='网页错误'):
                    logging.info('Page error on {}, skipping.'.format(date))
                    start_date = nextDate(start_date)
                    p += 1
                    continue
                tiktok = time.time()
                for dataitem in bsObj.find_all('data'):
                    instrumentid = dataitem.find('instrumentid').get_text()
                    tradingday = dataitem.find('tradingday').get_text()
                    datatypeid = dataitem.find('datatypeid').get_text()
                    paiming = dataitem.find('rank').get_text()
                    shortname = dataitem.find('shortname').get_text()
                    volume = dataitem.find('volume').get_text()
                    varvolume = dataitem.find('varvolume').get_text()
                    partyid = dataitem.find('partyid').get_text()

                    # Normalize: YYYYMMDD -> YYYY-MM-DD, cast numbers, strip whitespace.
                    tradingday = tradingday[0:4] + '-' + tradingday[
                        4:6] + '-' + tradingday[6:8]
                    datatypeid = int(datatypeid)
                    paiming = int(paiming)
                    volume = int(volume)
                    varvolume = int(varvolume)
                    partyid = int(partyid)
                    instrumentid = instrumentid.strip()
                    shortname = shortname.strip()
                    # The old duplicate-check query, disabled because it was too slow:
                    #cur.execute('SELECT partyid FROM cffex_t WHERE instrumentid LIKE "%s" AND tradingday = "%s" AND datatypeid = %d AND paiming = %d;' % (instrumentid, tradingday, datatypeid, paiming))
                    cur.execute(
                        'INSERT INTO cffex_t (instrumentid, tradingday, datatypeid, paiming, shortname, volume, varvolume, partyid) VALUES (%s, %s, %s, %s, %s, %s, %s, %s);',
                        (instrumentid, tradingday, datatypeid, paiming,
                         shortname, volume, varvolume, partyid))
                # Commit once per file instead of once per row.
                conn.commit()
                logging.info('Inserted data for {}.'.format(date))
                print('\rElapsed: {:.0f} min, progress: about {:.2%}, processing: {}. This page took {:.2f} s'.format(
                    (time.time() - start_time) / 60,
                    float(p) / total_days / len(kinds), date,
                    time.time() - tiktok),
                      end='')
                start_date = nextDate(start_date)
                p += 1
    # except Exception:
    #     print('Something went wrong...')
    finally:
        end_time = time.time()
        t = end_time - start_time
        print('Done. Total time: %d s, about %d min / %f h' % (t, t / 60, t / 3600))
        cur.close()
        conn.close()

Creating the cffex_t table in MySQL:

CREATE TABLE cffex_t (
    no INT UNSIGNED NOT NULL AUTO_INCREMENT,
    instrumentid VARCHAR(7) NOT NULL,
    tradingday DATE NOT NULL,
    datatypeid TINYINT NOT NULL,
    paiming TINYINT NOT NULL,
    shortname VARCHAR(5) NOT NULL,
    volume MEDIUMINT NOT NULL,
    varvolume MEDIUMINT NOT NULL,
    partyid SMALLINT NOT NULL,
    created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (no)
);
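As a usage sketch (the contract ID and date below are made up, and the meaning of each datatypeid value should be checked against the source XML; the site publishes three ranking lists: trading volume, long positions and short positions), reading back one ranking list might look like:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='aboutime',
                       db='futures', charset='utf8')
cur = conn.cursor()
# Made-up contract and date; datatypeid picks one of the three ranking lists.
cur.execute(
    'SELECT paiming, shortname, volume, varvolume FROM cffex_t '
    'WHERE instrumentid = %s AND tradingday = %s AND datatypeid = %s '
    'ORDER BY paiming;', ('IF1912', '2019-12-31', 1))
for paiming, shortname, volume, varvolume in cur.fetchall():
    print(paiming, shortname, volume, varvolume)
cur.close()
conn.close()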

Also, fetching one page takes about 0.65 s; to keep the load on the site low, the scraper sleeps 1 s between pages.
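For the stated range that works out to roughly 1461 days (2016.1.1 to 2020.1.1) × 6 products × 5/7 ≈ 6,260 pages; at about 1.65 s per page (fetch plus pause) that is roughly 10,300 s, i.e. just under three hours of scraping.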
