I've been writing Java lately, yet somehow my blog has filled up with crawler posts. Yesterday a friend sent me a request asking me to fix her code, so I reworked it myself. Going forward I should write more about data platform engineering.
The requirement was as follows:
# coding:utf-8
import os
import time

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy  # writable copy of an xlrd workbook, used for appending
class ZhaotoubiaopingtaiSpider():
    url = 'http://bulletin.cebpubservice.com/xxfbcmses/search/bulletin.html'
    wordkey = '银行'
    startcheckDate = '2019-05-01'
    endcheckDate = '2019-08-07'
    params = {
        'searchDate': '1994-08-06',
        'dates': '300',
        'word': wordkey,
        'categoryId': 88,
        'startcheckDate': startcheckDate,
        'endcheckDate': endcheckDate,
        'industryName': '',
        'area': ''
    }
    headers = {
        # Session cookie copied from a browser session; it may need refreshing.
        'Cookie': 'JSESSIONID=86C51A1A0546E2DA84226C777B97326E; acw_tc=7b39758715651005855884465e5f705fb7b91c58119618b6147c26f348c7ee; Hm_lvt_da6ebc493961b944c4bf10a22517a198=1565100586,1565102272; Hm_lpvt_da6ebc493961b944c4bf10a22517a198=1565102918',
        'Host': 'bulletin.cebpubservice.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    def get_page(self, url, headers, params):
        # Fetch one results page; return the decoded HTML, or None on failure.
        try:
            response = requests.get(url, headers=headers, params=params)
            if response.status_code == 200:
                # Decode leniently and drop the © (\xa9) character.
                html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
                return html
            else:
                print(response.status_code)
                return None
        except requests.ConnectionError:
            return None
    def parse_page(self, html):
        # Each search result is one <tr> of the results table.
        table_list = html.xpath('//table[@class="table_text"]/tr')
        all_info = []
        for table in table_list:
            td_list = table.xpath("td//text()")
            if len(td_list) > 3:
                # Drop empty or whitespace-only text nodes and strip layout characters.
                new_td_list = []
                for text in td_list:
                    if text is None or text == '' or text.isspace():
                        continue
                    new_td_list.append(text.replace('\t', '').replace('\r\n', ''))
                print(new_td_list)
                title = new_td_list[0]
                hangye = new_td_list[1]
                diqu = new_td_list[2]
                qudao = new_td_list[3]
                gonggaofabushijian = new_td_list[4]
                # The bid-opening time sits in the id attribute of the openTime cell.
                kaibiaoshijian = table.xpath("td[@name=\"openTime\"]/@id")[0]
                # The link is wrapped in javascript:urlOpen('...'); unwrap it.
                url = table.xpath("td/a/@href")[0]
                href_url = url.replace('javascript:urlOpen(\'', '').replace('\')', '')
                info = [title, hangye, diqu, qudao, gonggaofabushijian, kaibiaoshijian, href_url]
                all_info.append(info)
        return all_info
    def write_data(self, sheet, row, lst):
        # Write each record in lst as one spreadsheet row, starting at the given row.
        for data_infos in lst:
            for j, data in enumerate(data_infos):
                sheet.write(row, j, data)
            row += 1
    def save(self, file_name, data):
        if os.path.exists(file_name):
            # Open the existing workbook (formatting_info=True preserves styles).
            rb = xlrd.open_workbook(file_name, formatting_info=True)
            # Rows already present, so new data is appended below them.
            rn = rb.sheets()[0].nrows
            # xlrd workbooks are read-only; xlutils.copy gives a writable copy.
            wb = copy(rb)
            sheet = wb.get_sheet(0)
            self.write_data(sheet, rn, data)
            # Replace the old file with the extended copy.
            os.remove(file_name)
            wb.save(file_name)
        else:
            header = ['名称', '行业', '地区', '渠道', '公告发布时间', '开标时间', '链接url']
            book = xlwt.Workbook(encoding='utf-8')
            sheet = book.add_sheet('中国招标投标公共服务平台数据下载')
            # Write the header row, then the data from row 1 onwards.
            for h in range(len(header)):
                sheet.write(0, h, header[h])
            self.write_data(sheet, 1, data)
            book.save(file_name)
    def run(self):
        print('*' * 80)
        print('\t\t\t\t中国招标投标公共服务平台数据下载')
        print('作者:GRG 2019.8.05')
        print('--------------')
        path = input('请输入要保存的地址(例如:C:\\Users\\xhdong1\\Desktop\\),不输入直接按回车表示默认当前位置:\n')
        file_name = path + '中国招标投标公共服务平台' + '.xls'
        for i in range(1, 1000):
            print('正在爬取第{}页'.format(i))
            self.params['page'] = i
            html = self.get_page(url=self.url, headers=self.headers, params=self.params)
            if html is None:
                # Fetch failed; skip this page instead of crashing on a None parse.
                continue
            try:
                html = etree.HTML(html)
            except Exception:
                continue
            all_info = self.parse_page(html)
            self.save(file_name, all_info)
            # Be gentle with the server between pages.
            time.sleep(3)


if __name__ == '__main__':
    zhaotoubiaopingtaiSpider = ZhaotoubiaopingtaiSpider()
    zhaotoubiaopingtaiSpider.run()
The script now runs as-is.
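Its third-party dependencies are requests, lxml, xlrd, xlwt, and xlutils. Before letting the full 1000-page loop run, a minimal smoke test like the one below can confirm the fetch-and-parse path still works. It is only a sketch: it assumes the class above is defined in the same file and that the site still responds as it did in August 2019.

# Minimal smoke test: fetch and parse just the first results page.
spider = ZhaotoubiaopingtaiSpider()
spider.params['page'] = 1  # request only page 1
html = spider.get_page(url=spider.url, headers=spider.headers, params=spider.params)
if html:
    rows = spider.parse_page(etree.HTML(html))  # etree comes from the imports above
    print('parsed {} rows from page 1'.format(len(rows)))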
Thanks to every dear person around me.