# 目标页面: http://creditcard.ccb.com/cn/creditcard/creditFavarite.html#card_province=1020&card_city=196&startNum=1&endNum=10
# 运行环境: Python 3.6
# 技术使用参照另外两篇文章:
#   爬虫-中国银行卡-优惠商户活动数据(2018-11-15)
#   爬虫-新浪财经-信用卡优惠商店数据(2018-11-15)
# -*-coding:utf-8-*-
import json
import os
import sys
import requests
import xlrd
import xlwt
from xlutils.copy import copy
def get_page(url):
    """Fetch *url* and return the ``requests`` Response on HTTP 200.

    Returns ``None`` on any request failure or non-200 status so callers
    can decide how to handle a missing page.
    """
    try:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network-level failure (DNS, refused connection, timeout, ...).
        return None
    if response.status_code == 200:
        return response
    # Non-200 answers are treated the same as a failed request.
    return None
def write_data(sheet, row, lst):
    """Write each record in *lst* into *sheet* as one row, starting at *row*.

    sheet -- an xlwt worksheet (anything with a ``write(row, col, value)`` method)
    row   -- zero-based index of the first row to write
    lst   -- iterable of records, each an iterable of cell values
    """
    for offset, record in enumerate(lst):
        for col, value in enumerate(record):
            sheet.write(row + offset, col, value)
def save(file_name, data):
    """Append *data* rows to the .xls file *file_name*, creating it (with a
    header row) if it does not exist yet.

    data -- list of rows, each a list of cell values in header order.
    """
    if os.path.exists(file_name):
        # Open the existing workbook, preserving cell formatting.
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        # Number of rows already present in the first sheet.
        rn = rb.sheets()[0].nrows
        # xlrd workbooks are read-only; copy into a writable xlwt workbook.
        wb = copy(rb)
        sheet = wb.get_sheet(0)
        # Append the new rows after the existing ones.
        write_data(sheet, rn, data)
        # Remove the old file before saving the merged workbook.
        os.remove(file_name)
        wb.save(file_name)
    else:
        # One column per field collected in main(). The previous header was
        # mis-quoted ('province,city_id' / 'city,biz_addr' each fused two
        # names), leaving it two columns short of the 18-field data rows.
        header = ['biz_id', 'biz_name', 'cate_id', 'cate_name', 'catechild_id',
                  'catechild_name', 'province_id', 'province', 'city_id', 'city',
                  'biz_addr', 'biz_desc', 'start_level', 'life_id', 'life',
                  'biz_phone', 'biz_cmsg', 'url']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('建设银行卡-优惠商户活动数据')
        # Header row first...
        for col, title in enumerate(header):
            sheet.write(0, col, title)
        # ...then the data rows starting at row 1.
        write_data(sheet, 1, data)
        book.save(file_name)
def main():
    """Crawl CCB merchant-discount data for every province/city and save it
    to a single .xls workbook, one city at a time."""
    cities_js = 'http://creditcard.ccb.com/cn/creditcard/v3/js/citys.js'
    cities_resp = get_page(cities_js)
    if cities_resp is None:
        # Without the city list nothing can be crawled; get_page already
        # swallowed the error, so just report and stop.
        print('无法获取城市列表,退出')
        return
    # The js file is `var xxx = [...]`; keep only the JSON array after '='.
    cities_text = cities_resp.content.decode('utf-8').split('=')[1]
    cities_json = json.loads(cities_text)
    path = (input('请输入要保存的地址(例如:C:\\Users\\xhdong1\\Desktop\\),不输入则保存到当前地址:\n'))
    file_name = path + '建设银行卡-优惠商户活动数据.xls'
    print(file_name)
    province_total = len(cities_json)
    for district in cities_json:
        prov_code = district['prov_code']
        prov_name = district['prov_name']
        # NOTE: 'citys' is the key used by the CCB js file, not a typo here.
        cities = district['citys']
        city_total = len(cities)
        for city in cities:
            all_company_info_list = []
            city_code = city['city_code']
            city_name = city['city_name']
            print('总共有【{province_total}】个省份,现在正在爬取【{prov_name}】,该省份一共有【{city_total}】个城市,现在正在爬取【{city}】的数据'.format(
                province_total=province_total, prov_name=prov_name, city_total=city_total, city=city_name))
            # endNum=1000000 asks the endpoint for every merchant in one page.
            url = 'http://creditcard.ccb.com/webtran/get_crd_info.gsp?table_type=2&card_province={prov_code}&card_city={city_code}&startNum=1&endNum=1000000'.format(
                prov_code=prov_code, city_code=city_code)
            companies_resp = get_page(url)
            if companies_resp is None:
                # Request failed for this city — skip it rather than crash.
                continue
            try:
                companies_json = json.loads(companies_resp.content.decode('utf-8'))
            except ValueError:
                # Endpoint sometimes returns non-JSON; skip this city.
                continue
            if not companies_json:
                continue
            companies = companies_json['obj']
            for company in companies:
                biz_id = company.get('biz_id')
                biz_name = company.get('biz_name')
                cate_id = company.get('cate_id')
                cate_name = company.get('cate_name')
                catechild_id = company.get('catechild_id')
                catechild_name = company.get('catechild_name')
                province_id = company.get('province_id')
                province = company.get('province')
                city_id = company.get('city_id')
                city_label = company.get('city')
                biz_addr = company.get('biz_addr')
                biz_desc = company.get('biz_desc')
                start_level = company.get('start_level')
                life_id = company.get('life_id')
                life = company.get('life')
                biz_phone = company.get('biz_phone')
                biz_cmsg = company.get('biz_cmsg')
                # Guard against a missing biz_id (concatenating None raises).
                detail_url = 'http://creditcard.ccb.com/cn/creditcard/favorable/' + (biz_id or '') + '.html'
                company_info = [biz_id, biz_name, cate_id, cate_name, catechild_id, catechild_name, province_id,
                                province, city_id, city_label, biz_addr, biz_desc, start_level, life_id, life,
                                biz_phone, biz_cmsg, detail_url]
                all_company_info_list.append(company_info)
            # Persist after each city so a crash loses at most one city.
            save(file_name, all_company_info_list)
    print('爬完')
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# 感谢生活。