Scraper - Shanghai Pudong Development Bank (SPDB) - Credit Card Data (2018-11-19)

Table of Contents

    • Scrape Target URL
    • Scraping Approach
    • Scraper Code
    • Acknowledgements

Scrape Target URL

http://per.spdb.com.cn/professional_investment_research/preferential_merchants/
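The offers are split into three catalog sub-sections (dnxb, sqshq, yxjnh). Within each catalog, the first list page is index.shtml and later pages are index_1.shtml, index_2.shtml, and so on. A minimal sketch of building those list-page URLs, mirroring the hard-coded three-pages-per-catalog limit used in the full script below (the helper name list_page_urls is just for illustration):

BASE = 'http://per.spdb.com.cn/professional_investment_research/preferential_merchants'

def list_page_urls(catalog, num_pages=3):
    # First page is index.shtml; page i (for i >= 1) is index_i.shtml
    for i in range(num_pages):
        if i == 0:
            yield '{}/{}/index.shtml'.format(BASE, catalog)
        else:
            yield '{}/{}/index_{}.shtml'.format(BASE, catalog, i)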

Scraping Approach

The approach (requests for HTTP, lxml XPath for parsing, and xlrd/xlwt/xlutils for appending to an Excel file) is the same as in these earlier posts:
Scraper - China Bank Card - Preferential Merchant Activity Data (2018-11-15)
Scraper - Sina Finance - Credit Card Discount Merchant Data (2018-11-15)
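The core pattern in all of these is the same: fetch the page with requests, parse it with lxml, and pull fields out with XPath. A minimal sketch using the list-page XPath from the full script below:

import requests
from lxml import etree

url = 'http://per.spdb.com.cn/professional_investment_research/preferential_merchants/dnxb/index.shtml'
html = etree.HTML(requests.get(url, timeout=10).content.decode('utf-8'))
# Relative hrefs of the merchant-offer detail pages on this list page
print(html.xpath('//ul[@class="ul_list"]/li/a/@href'))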

Scraper Code

# -*-coding:utf-8-*-
import os

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy


def get_page(url):
    """Fetch a page; return the response on HTTP 200, otherwise None."""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response
        return None
    except requests.RequestException:
        return None


def parse_page(page):
    """Extract one offer's fields from a detail page; missing trailing fields become ''."""
    all_content = page.xpath('//div[@class="content_text_l fl"]/ul/li/p/text()')
    if not all_content:
        return None

    def field(index):
        # Fields appear in a fixed order; later ones (rule, address, ...) may be absent
        try:
            return str(all_content[index]).replace('\xa0', '')
        except IndexError:
            return ''

    # time, target, content, rule, address, telephone, introduction
    info_list = [field(0), field(1), field(2), field(3), field(4), field(5), field(6)]
    print(info_list)
    return info_list


def write_data(sheet, row, lst):
    """Write one row per record to the sheet, starting at the given row index."""
    for data_infos in lst:
        for j, data in enumerate(data_infos):
            sheet.write(row, j, data)
        row += 1


def save(file_name, data):
    if os.path.exists(file_name):
        # Open the existing workbook (formatting_info=True preserves cell styles)
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        # Number of rows already written, via xlrd
        rn = rb.sheets()[0].nrows
        # Copy the read-only workbook into a writable one
        wb = copy(rb)
        # Get the first sheet of the copy
        sheet = wb.get_sheet(0)
        # Append the new rows after the existing ones
        write_data(sheet, rn, data)
        # Remove the old file, then save the copy under the same name
        os.remove(file_name)
        wb.save(file_name)
    else:
        # One column per field produced by parse_page, in the same order
        header = ['activity_time', 'target_card', 'activity_content', 'activity_rule',
                  'merchant_address', 'merchant_telephone', 'merchant_introduction']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('SPDB-credit-card-data')
        # Write the header row
        for h in range(len(header)):
            sheet.write(0, h, header[h])
        # Write the data rows below the header
        write_data(sheet, 1, data)
        book.save(file_name)


def main():
    print('*' * 80)
    print('\t\t\t\tSPDB credit card data downloader, source: '
          'http://per.spdb.com.cn/professional_investment_research/preferential_merchants/')
    print('Author: Xie Huadong  2018.11.19')
    print('--------------')
    path = input('Enter the directory to save to (e.g. C:\\Users\\xhdong1\\Desktop\\), '
                 'or press Enter to save to the current directory:\n')

    file_name = path + 'SPDB-credit-card-data.xls'
    print(file_name)

    base_url = 'http://per.spdb.com.cn/professional_investment_research/preferential_merchants/{catalog}/index_{num_page}.shtml'
    catalogs = ['dnxb', 'sqshq', 'yxjnh']
    for catalog in catalogs:
        # The first list page is index.shtml; later pages are index_1.shtml, index_2.shtml, ...
        # (the page count per catalog is hard-coded to 3 here)
        for i in range(0, 3):
            if i == 0:
                url = 'http://per.spdb.com.cn/professional_investment_research/preferential_merchants/{catalog}/index.shtml'.format(
                    catalog=catalog)
            else:
                url = base_url.format(catalog=catalog, num_page=i)
            response = get_page(url)
            if response is None:
                print('No data under ' + url)
                continue
            page = response.content.decode('utf-8')
            html = etree.HTML(page)
            # Relative links to each merchant-offer detail page
            all_detail_url = html.xpath('//ul[@class="ul_list"]/li/a/@href')
            all_info_list = []
            for detail_href in all_detail_url:
                # Hrefs look like './xxx.shtml'; drop the leading '.' and prepend the catalog URL
                detail_url = ('http://per.spdb.com.cn/professional_investment_research/preferential_merchants/'
                              + catalog + detail_href[1:])
                print(detail_url)
                detail_response = get_page(detail_url)
                if detail_response is None:
                    continue
                detail_page = etree.HTML(detail_response.content.decode('utf-8'))
                info_list = parse_page(detail_page)
                if info_list:
                    all_info_list.append(info_list)
            # Save once per list page; saving inside the loop would append duplicate rows
            save(file_name, all_info_list)


if __name__ == '__main__':
    main()
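If you want to run this, the third-party dependencies (assuming a standard pip setup) can be installed with:

pip install requests lxml xlrd xlwt xlutils

The xlrd/xlwt/xlutils trio is used because the output is a legacy .xls workbook that gets appended to across runs: xlrd reads the existing file, xlutils.copy turns that read-only workbook into a writable xlwt one, and the copy is saved back under the same name.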

Acknowledgements

Thanks to this beautiful life. Time is long, and code should be written slowly.
