I'm in a good mood today, partly because 2019 is almost here, and partly just because. So I'm sharing a crawler script I wrote for work. It scrapes the government procurement search site at ccgp.gov.cn:
http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&start_time=&end_time=&timeType=2&searchparam=&searchchannel=0&dbselect=bidx&kw=&bidSort=0&pinMu=0&bidType=0&buyerName=&projectId=&displayZone=&zoneId=&agentName=
Here's a fragment first (the threading part of run()):
# Create one thread per detail-page URL; pass self.start as the target
# and wrap the URL in a tuple, otherwise it would run synchronously here
threads = []
files = range(len(url_list))
for url in url_list:
    t = threading.Thread(target=self.start, args=(url,))
    threads.append(t)
# Start the threads, then wait for them all to finish
for i in files:
    threads[i].start()
for i in files:
    threads[i].join()
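One thread per URL is fine for a single page of results, but if you'd rather cap how many detail pages are fetched at once, a thread pool works too. This is only a sketch of an alternative, not what the script below uses; max_workers=8 is an arbitrary assumption, and it presumes it runs inside run() where self and url_list are in scope.
# Alternative sketch: bounded concurrency via the standard library's thread pool.
# max_workers=8 is an assumption; tune it to how hard you want to hit the site.
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=8) as pool:
    # map() blocks until every self.start(url) call has returned
    list(pool.map(self.start, url_list))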
The project is split into three scripts: config.py, mysql.py, and zhengfucaigouspider.py. First, config.py holds the database settings:
MYSQL_HOST = '10.1.40.206'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root123'
MYSQL_DATABASE = 'zhaobiao'
# mysql.py
import pymysql

from com.grg.spider.zhaobiao.config import *


class MySQL():
    def __init__(self, host=MYSQL_HOST,
                 username=MYSQL_USER, password=MYSQL_PASSWORD,
                 port=MYSQL_PORT, database=MYSQL_DATABASE):
        try:
            # Keyword arguments keep this working on newer PyMySQL releases,
            # which no longer accept the connection settings positionally
            self.db = pymysql.connect(host=host, user=username, password=password,
                                      database=database, charset='utf8', port=port)
            self.cursor = self.db.cursor()
        except pymysql.MySQLError as e:
            print(e.args)

    def insert(self, table, data):
        # Build "insert into <table> (k1,k2,...) values (%s,%s,...)" from the
        # dict keys and let the driver bind the values
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql_query = 'insert into %s (%s) values (%s)' % (table, keys, values)
        try:
            self.cursor.execute(sql_query, tuple(data.values()))
            self.db.commit()
        except pymysql.MySQLError as e:
            print(e.args)
            self.db.rollback()
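Because insert() builds the column list straight from the dict keys, the target table's column names have to match the keys the spider produces (the Chinese field titles from the detail pages, plus 插入时间). A rough usage sketch, with the table name taken from the spider's insert call and the values made up purely for illustration:
# Usage sketch only; the column names follow the dict keys the spider builds,
# and the values here are invented examples
db = MySQL()
db.insert('zhenfucaigou', {
    '公告时间': '2018年12月28日 19:33',
    '行政区域': '广东省',
    '插入时间': '2018-12-29 10:00:00',
})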
# zhengfucaigouspider.py
# coding:utf-8
import datetime
import json
import re
import threading
import time

import requests
from lxml import etree

from com.grg.spider.zhaobiao.mysql import MySQL


class ZhenfucaigouSpider():
    url = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1'
    keyword = '银行'
    start_time = '2018:06:01'
    end_time = '2018:12:29'
    page_num = 1
    params = {
        'searchtype': '1',
        'page_index': page_num,
        'bidSort': '0',
        'pinMu': '0',
        'bidType': '0',
        'kw': keyword,
        'start_time': start_time,
        'end_time': end_time,
        'timeType': '6'
    }
    headers = {
        # The Cookie here is tied to a particular session; grab a fresh one if the site starts rejecting requests
        'Cookie': 'JSESSIONID=EgPd86-6id_etA2QDV31Kks3FrNs-4gwHMoSmEZvnEktWIakHbV3!354619916; Hm_lvt_9f8bda7a6bb3d1d7a9c7196bfed609b5=1545618390; Hm_lpvt_9f8bda7a6bb3d1d7a9c7196bfed609b5=1545618390; td_cookie=2144571454; Hm_lvt_9459d8c503dd3c37b526898ff5aacadd=1545611064,1545618402,1545618414; Hm_lpvt_9459d8c503dd3c37b526898ff5aacadd=1545618495',
        'Host': 'search.ccgp.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3141.8 Safari/537.36'
    }
    mysql = MySQL()
    def get_page(self, url, headers, params):
        try:
            response = requests.get(url, headers=headers, params=params)
            if response.status_code == 200:
                html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
                return html
            else:
                print(response.status_code)
        except requests.ConnectionError:
            return None

    def get_detail_page(self, url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
                return html
        except requests.ConnectionError:
            return None
    def get_all_url(self, html):
        # Pull every href that ends in .htm out of the search result page;
        # these are the links to the individual announcement detail pages
        pattern1 = '<.*?(href=".*?htm").*?'
        href_url = re.findall(pattern1, html, re.I)
        url_list = []
        for url in href_url:
            url1 = url.replace('href=', '').replace('"', '')
            url_list.append(url1)
        return url_list
    def parse_detail_page(self, html):
        table_list = html.xpath('//div[@class="table"]//tr')
        all_info = {}
        for table in table_list:
            if len(table.xpath('td[@class="title"]/text()')) > 0:
                title = ''.join(table.xpath('td[@class="title"]/text()'))
                value = ''.join(table.xpath('td[@colspan="3"]/text()'))
                # Attachment (附件) rows: store the download URL instead of the cell text
                if title.find('附件') == 0:
                    value = 'http://www.ccgp.gov.cn/oss/download?uuid=' + ''.join(table.xpath('td[@colspan="3"]/a/@id'))
                # The announcement-time (公告时间) row also carries the district (行政区域)
                if '公告时间' in title:
                    title = '公告时间'
                    value = table.xpath('td[@width="168"]/text()')[1]
                    district_key = '行政区域'
                    district_value = (table.xpath('td[@width="168"]/text()'))[0]
                    all_info[district_key] = district_value
                # Row combining the tender announcement date with the award date (中标日期)
                if '本项目招标公告日期中标日期' in title:
                    title = '本项目招标公告日期'
                    value = table.xpath('td[@width="168"]/text()')[0]
                    zhongbiaoriqi_key = '中标日期'
                    zhongbiaoriqi_value = table.xpath('td[@width="168"]/text()')[1]
                    all_info[zhongbiaoriqi_key] = zhongbiaoriqi_value
                # Row combining the tender announcement date with the deal date (成交日期)
                if '本项目招标公告日期成交日期' in title:
                    title = '本项目招标公告日期'
                    value = table.xpath('td[@width="168"]/text()')[0]
                    zhongbiaoriqi_key = '中标日期'
                    zhongbiaoriqi_value = ''.join(table.xpath('td[@width="168"]/text()'))[11:]
                    all_info[zhongbiaoriqi_key] = zhongbiaoriqi_value
                all_info[title] = value
        all_info['插入时间'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return all_info
        # return json.dumps(all_info, ensure_ascii=False)
    def start(self, url):
        # Pause before each detail-page request so we don't hammer the site
        time.sleep(3)
        html = self.get_detail_page(url)
        if not html:
            return
        html = etree.HTML(html)
        all_info = self.parse_detail_page(html)
        print(all_info)
        self.mysql.insert('zhenfucaigou', all_info)
    def run(self):
        for i in range(1, 200):
            print('Crawling page {}'.format(str(i)))
            self.params['page_index'] = i
            html = self.get_page(url=self.url, headers=self.headers, params=self.params)
            if not html:
                continue
            url_list = self.get_all_url(html)
            # Create one thread per detail-page URL; pass self.start as the target
            # and wrap the URL in a tuple, otherwise it would run synchronously here
            threads = []
            files = range(len(url_list))
            for url in url_list:
                t = threading.Thread(target=self.start, args=(url,))
                threads.append(t)
            # Start the threads, then wait for them all to finish
            for i in files:
                threads[i].start()
            for i in files:
                threads[i].join()


if __name__ == '__main__':
    zhenfucaigouSpider = ZhenfucaigouSpider()
    zhenfucaigouSpider.run()
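If you want to sanity-check the detail-page parsing before letting run() walk through all 200 result pages, something like the sketch below works; the URL is just a made-up placeholder, so swap in a real detail link taken from the search results.
# Single-page smoke test; the detail URL is a hypothetical placeholder
spider = ZhenfucaigouSpider()
detail_url = 'http://www.ccgp.gov.cn/cggg/zygg/zbgg/201812/t20181229_0000000.htm'  # placeholder
html = spider.get_detail_page(detail_url)
if html:
    print(spider.parse_detail_page(etree.HTML(html)))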
The code is a bit rough around the edges, so please bear with me, folks.
Many thanks for everything life has given!