Web Scraper: Crawling Data from the Government Procurement Website (2018-12-28)

Table of Contents

    • Preface
    • Scraping Approach
      • Target Website
      • Approach
    • Technical Highlights
    • Scripts
    • Acknowledgments

Preface

I'm in a good mood today, partly because 2019 is just around the corner and partly for no particular reason at all.
So I decided to share a scraper script I wrote for work.

Scraping Approach

Target Website

http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&start_time=&end_time=&timeType=2&searchparam=&searchchannel=0&dbselect=bidx&kw=&bidSort=0&pinMu=0&bidType=0&buyerName=&projectId=&displayZone=&zoneId=&agentName=
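
The long query string boils down to a handful of parameters; the ones the spider actually varies are kw (the keyword), start_time/end_time (the date range) and page_index (pagination). A minimal request sketch with the same values the full script uses (the full script additionally sends a User-Agent and a Cookie header):

import requests

# Only the parameters the spider varies; the rest stay at their defaults.
params = {
    'searchtype': '1', 'bidSort': '0', 'pinMu': '0', 'bidType': '0',
    'kw': '银行', 'start_time': '2018:06:01', 'end_time': '2018:12:29',
    'timeType': '6', 'page_index': 1,
}
response = requests.get('http://search.ccgp.gov.cn/bxsearch', params=params)
print(response.status_code)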

Approach

  1. First collect the detail-page links from each search-result (list) page, then loop over them and parse the data out of every detail page.
  2. Save each parsed record as a Python dict and write it to MySQL (a sketch of the whole flow follows this list).
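
Below is a minimal sketch of that flow, written against the spider class defined further down (the import path is an assumption, mirroring the package layout of the other imports); it is essentially what the run() method does at the end of the script, minus the threading.

# Sketch only: the same flow as ZhenfucaigouSpider.run(), but sequential.
from com.grg.spider.zhaobiao.zhengfucaigouspider import ZhenfucaigouSpider  # assumed module path

spider = ZhenfucaigouSpider()
for page_index in range(1, 200):                      # walk the paginated search results
    spider.params['page_index'] = page_index
    list_html = spider.get_page(spider.url, spider.headers, spider.params)
    for detail_url in spider.get_all_url(list_html):  # links to the individual announcements
        spider.start(detail_url)                      # parse the detail page and insert into MySQL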

Technical Highlights

  1. A multithreaded parsing step to speed things up: one thread is spawned per detail-page link on the current list page, and all of them are joined before moving on to the next page.

Here is the relevant snippet:

# Create one thread per detail-page URL
threads = []
for url in url_list:
    # Pass the callable and its arguments separately; writing self.start(url)
    # here would call it immediately in the main thread instead of in the new thread.
    t = threading.Thread(target=self.start, args=(url,))
    threads.append(t)

# Start all threads, then wait for them to finish
for t in threads:
    t.start()
for t in threads:
    t.join()
  2. Inserts into MySQL without a fixed set of columns: the INSERT statement is built from the keys of whatever dict is passed in (see the short example below).
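
For illustration, here is roughly what that dynamic INSERT looks like for a small, made-up record; the real implementation is the insert() method in mysql.py below.

# Hypothetical two-field record: the column names come straight from the dict keys.
data = {'项目名称': '某银行设备采购项目', '行政区域': '广东省'}
keys = ','.join(data.keys())                    # "项目名称,行政区域"
values = ','.join(['%s'] * len(data))           # "%s,%s"
sql_query = 'insert into %s (%s) values (%s)' % ('zhenfucaigou', keys, values)
print(sql_query)  # insert into zhenfucaigou (项目名称,行政区域) values (%s,%s)
# cursor.execute(sql_query, tuple(data.values())) then binds the actual values.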

Scripts

The project is split into three scripts: config.py, mysql.py, and zhengfucaigouspider.py, shown below in that order.

# config.py: MySQL connection settings
MYSQL_HOST = '10.1.40.206'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root123'
MYSQL_DATABASE = 'zhaobiao'

# mysql.py: thin pymysql wrapper used by the spider
import pymysql

from com.grg.spider.zhaobiao.config import *


class MySQL():
    def __init__(self, host=MYSQL_HOST,
                 username=MYSQL_USER, password=MYSQL_PASSWORD,
                 port=MYSQL_PORT, database=MYSQL_DATABASE):
        try:
            self.db = pymysql.connect(host=host, user=username, password=password,
                                      database=database, charset='utf8', port=port)
            self.cursor = self.db.cursor()
        except pymysql.MySQLError as e:
            print(e.args)

    def insert(self, table, data):
        # Build the INSERT statement dynamically from the dict keys, so any set
        # of columns can be written as long as the table defines them.
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql_query = 'insert into %s (%s) values (%s)' % (table, keys, values)
        try:
            self.cursor.execute(sql_query, tuple(data.values()))
            self.db.commit()
        except pymysql.MySQLError as e:
            print(e.args)
            self.db.rollback()
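
A quick usage sketch for the class above, assuming the zhaobiao database has a zhenfucaigou table whose columns match the keys of the dict being inserted (the values here are made up for illustration):

from com.grg.spider.zhaobiao.mysql import MySQL

mysql = MySQL()
# Any dict works, as long as the target table defines a column for each key.
mysql.insert('zhenfucaigou', {
    '项目名称': '某银行设备采购项目',
    '插入时间': '2018-12-28 10:00:00',
})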

#coding:utf-8
# zhengfucaigouspider.py: the spider itself
import datetime
import json
import re
import threading
import time

import requests
from lxml import etree
from com.grg.spider.zhaobiao.mysql import MySQL


class ZhenfucaigouSpider():
    url = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1'
    keyword = '银行'
    start_time = '2018:06:01'
    end_time = '2018:12:29'
    page_num = 1

    params = {
        'searchtype': '1',
        'page_index': page_num,
        'bidSort': '0',
        'pinMu': '0',
        'bidType': '0',
        'kw': keyword,
        'start_time': start_time,
        'end_time': end_time,
        'timeType': '6'
    }
    headers = {
        'Cookie': 'JSESSIONID=EgPd86-6id_etA2QDV31Kks3FrNs-4gwHMoSmEZvnEktWIakHbV3!354619916; Hm_lvt_9f8bda7a6bb3d1d7a9c7196bfed609b5=1545618390; Hm_lpvt_9f8bda7a6bb3d1d7a9c7196bfed609b5=1545618390; td_cookie=2144571454; Hm_lvt_9459d8c503dd3c37b526898ff5aacadd=1545611064,1545618402,1545618414; Hm_lpvt_9459d8c503dd3c37b526898ff5aacadd=1545618495',
        'Host': 'search.ccgp.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3141.8 Safari/537.36'
    }
    mysql = MySQL()

    def get_page(self,url,headers,params):
        try:
            response = requests.get(url,headers=headers,params=params)
            if response.status_code == 200:
                html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
                #print(html)
                return html
            else:
                print(response.status_code)
        except requests.ConnectionError:
            return None

    def get_detail_page(self,url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
                #print(html)
                return html
        except requests.ConnectionError:
            return None


    def get_all_url(self,html):
        # Grab every href="...htm" link on the results page; these are the detail-page URLs
        pattern1 = '<.*?(href=".*?htm").*?'
        href_url = re.findall(pattern1, html, re.I)
        # print(href_url)
        url_list = []
        for url in href_url:
            url1 = url.replace('href=','').replace('"','')
            url_list.append(url1)
        return url_list

    def parse_detail_page(self,html):
        # Walk the rows of the announcement's info table and collect field name -> value pairs
        table_list = html.xpath('//div[@class="table"]//tr')
        #print(table_list)
        all_info = {}
        for table in table_list:
            if len(table.xpath('td[@class="title"]/text()'))>0:
                #print(''.join(table.xpath('td[@class="title"]/text()'))+":"+''.join(table.xpath('td[@colspan="3"]/text()')))
                title = ''.join(table.xpath('td[@class="title"]/text()'))
                value = ''.join(table.xpath('td[@colspan="3"]/text()'))
                if (title.find('附件')==0):
                    value = 'http://www.ccgp.gov.cn/oss/download?uuid='+''.join(table.xpath('td[@colspan="3"]/a/@id'))
                    #print(title+value)
                if ('公告时间' in title):
                    title = '公告时间'
                    value = table.xpath('td[@width="168"]/text()')[1]
                    district_key = '行政区域'
                    district_value = (table.xpath('td[@width="168"]/text()'))[0]
                    all_info[district_key]=district_value
                if '本项目招标公告日期中标日期' in title :
                    title = '本项目招标公告日期'
                    value = table.xpath('td[@width="168"]/text()')[0]
                    zhongbiaoriqi_key = '中标日期'
                    zhongbiaoriqi_value = table.xpath('td[@width="168"]/text()')[1]
                    all_info[zhongbiaoriqi_key]=zhongbiaoriqi_value
                    #print('中标日期'+zhongbiaoriqi_value)
                if '本项目招标公告日期成交日期' in title:
                    title = '本项目招标公告日期'
                    value = table.xpath('td[@width="168"]/text()')[0]
                    zhongbiaoriqi_key = '中标日期'
                    zhongbiaoriqi_value = ''.join(table.xpath('td[@width="168"]/text()'))[11:]
                    #print('zhongbiaoriqi_value:'+zhongbiaoriqi_value)
                    all_info[zhongbiaoriqi_key] = zhongbiaoriqi_value
                all_info[title] = value
                all_info['插入时间']= datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return all_info
        #return json.dumps(all_info,ensure_ascii=False)


    def start(self,url):
        time.sleep(3)  # small delay before each detail request so the site isn't hammered
        # print(url)
        html = self.get_detail_page(url)
        # print(html)
        html = etree.HTML(html)
        all_info = self.parse_detail_page(html)
        print(all_info)
        #print(all_info.keys())
        self.mysql.insert('zhenfucaigou',all_info)



    def run(self):
        for i in range(1,200):
            print('Crawling page {}'.format(i))
            self.params['page_index']=i
            html = self.get_page(url=self.url, headers=self.headers, params=self.params)
            # print(html)
            url_list = self.get_all_url(html)

            # Create one thread per detail-page URL
            threads = []
            for url in url_list:
                t = threading.Thread(target=self.start, args=(url,))
                threads.append(t)

            # Start the threads, then wait for them all to finish
            for t in threads:
                t.start()
            for t in threads:
                t.join()





if __name__ == '__main__':
    zhenfucaigouSpider = ZhenfucaigouSpider()
    zhenfucaigouSpider.run()

Acknowledgments

The code is a bit rough, so please bear with me.
Many thanks for everything life has given!
