pyspider的一些简单应用

pyspider

  • 不用框架获取部分网页内容
url = "https://www.creditchina.gov.cn/xinxigongshi/shipinanquanjianduchoujian/jieguoxiangqing/index.html?id=62335&dataType=1"
# #         print("aaaaaa")
# #         html = requests.get(url, headers=headers, proxies=proxies )
# #         html.encoding = "utf-8"
# #
# #         content = pq(str(html.text))('div.content.clearfix > div > div > div.result-tab.result-tab1')
# #         print(content)
  • 路径
    C:\Users\YScredit\AppData\Roaming\Python\Python35\site-packages\pyspider
  • 打印页面
    response.text  （pyspider 的 Response.text 是属性，不是方法）
  • 指定数据库和表的编码格式
    CREATE DATABASE dbtest CHARACTER SET utf8 COLLATE utf8_general_ci;
CREATE TABLE tbtest(
NAME VARCHAR(111),
TYPE VARCHAR(111),
num VARCHAR(111),
address VARCHAR(111),
TIME VARCHAR(111)
)CHARACTER SET utf8 COLLATE utf8_general_ci;

实例no.1


#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017/12/20 9:22
# Project: bzxr_fj_fz
# @Author  : YPC
# @Email   : [email protected]


from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
import pymysql
from six import itervalues
import time
import datetime
import re


# 福建省福州法院网被执行人信息
# Scraper for 被执行人 (judgment debtor) notices published on the Fuzhou
# court website (fzszy.chinacourt.org), Fujian province.
class Handler(BaseHandler):
    crawl_config = {
        'itag': 'bzxr-fuzhou-0.7',
        # NOTE(review): pyspider's recognized option is spelled 'timeout';
        # 'time_out' is most likely ignored -- confirm against pyspider docs.
        'time_out': 4000,
        # 'proxy': 'http://yscredit:[email protected]:3129'
    }

    @staticmethod
    def _headers(referer=None):
        """Return browser-like request headers for the court site.

        :param referer: optional Referer URL, set on detail-page requests.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'fzszy.chinacourt.org',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
        }
        if referer is not None:
            headers['Referer'] = referer
        return headers

    @staticmethod
    def _new_record():
        """Return a fresh result-record template with every field blank.

        'exp' stays '1' while parsing succeeds and is flipped to '0' when
        an exception was swallowed along the way.
        """
        return {
            'id': '',
            'name': '',
            'case_code': '',
            'name_id': '',
            'itype': '',
            'card_num': '',
            'business_entity': '',
            'sex': '',
            'age': '',
            'address': '',
            'execute_money_backup': '',
            'unexecute_money_backup': '',
            'reg_date': '',
            'court_name': '',
            'org_url': '',
            'source': '福州法院网',
            'case_id': '',
            'exp': '1'
        }

    @every(minutes=24 * 60)
    @config(age=12 * 60 * 60)
    def on_start(self):
        """Entry point: schedule the listing page of debtor notices."""
        basic_url = 'http://fzszy.chinacourt.org/public/more.php?LocationID=0704000000'
        self.crawl(url=basic_url, headers=self._headers(), callback=self.basic_page)

    @config(age=12 * 60 * 60)
    def basic_page(self, response):
        """Parse the listing page and schedule one crawl per detail link.

        The record template travels to detail_page via ``save``.
        """
        dic = self._new_record()
        try:
            basic_url = response.url
            basic_trs = response.doc('tr.tr_odd td.td_line')
            headers = self._headers(referer=basic_url)
            for each_tr in basic_trs.items():
                org_url = each_tr('a').attr.href
                dic['org_url'] = org_url
                self.crawl(url=org_url, headers=headers,
                           callback=self.detail_page, save=dic)
        except Exception:
            # Best effort: flag the failure but still emit a partial record
            # if a name was captured (original behavior preserved).
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic

    @config(priority=2)
    @config(age=12 * 60 * 60)
    def detail_page(self, response):
        """Extract one record per table row from a detail page."""
        response.encoding = 'gbk'  # the court site serves GBK-encoded pages
        dic = response.save
        try:
            detail_trs = response.doc(' span.detail_content > strong > table  ')
            # Skip the first two (header) rows of the table.
            detail_tr = detail_trs('tr:not(:first-child):not(:nth-child(2))')
            for each_tr in detail_tr.items():
                dic['id'] = each_tr('td:nth-child(1)').text().replace(' ', '')
                dic['name'] = each_tr('td:nth-child(2)').text().replace(' ', '')
                dic['name_id'] = each_tr('td:nth-child(3)').text().replace(' ', '')
                dic['address'] = each_tr('td:nth-child(4)').text().replace(' ', '')
                dic['execute_money_backup'] = each_tr('td:nth-child(5)').text().replace(' ', '')
                yield dic
        except Exception:
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic

    def on_result(self, result):
        """Print and persist one yielded record.

        Fix: the original iterated ``result`` and called save_mysql()
        *before* the emptiness check, so the None result pyspider passes
        when a callback yields nothing would crash here.
        """
        if not result:
            return
        for key in result:
            print(key + "  " + result[key])
        self.save_mysql(result)

    def save_mysql(self, item):
        """Insert one record into MySQL.

        NOTE(review): the target table/columns ('tencent', positionName...)
        look copy-pasted from another spider and do not match this site's
        fields -- confirm the intended schema. Kept as-is to preserve
        behavior.
        """
        connect = pymysql.connect(host="192.168.59.128",
                                  user="root",
                                  password="123456",
                                  db="demo1",
                                  charset="utf8",
                                  use_unicode=False)
        try:
            cursor = connect.cursor()
            try:
                sql = "insert into tencent (positionName,positionLink,positionType," \
                      "peopleNumber,workLocation,publishTime)VALUES(%s,%s,%s,%s,%s,%s)"
                # Parameterized query: values are escaped by the driver.
                params = (item["address"], item['name'], item['source'],
                          item['execute_money_backup'], item['card_num'],
                          item['itype'])
                cursor.execute(sql, params)
                connect.commit()
            finally:
                cursor.close()
        finally:
            # Fix: the original leaked the connection when execute() raised.
            connect.close()

实例- 爬取腾讯招聘

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017/12/20 9:22
# Project: bzxr_fj_fz
# @Author  : YPC
# @Email   : [email protected]


from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
import pymysql
from six import itervalues
import time
import datetime
import re


# 福建省福州法院网被执行人信息
# Tencent recruitment (hr.tencent.com) spider: walks the paginated job
# listing and stores one row per position into MySQL.
class Handler(BaseHandler):
    crawl_config = {
        'itag': 'bzxr-fuzhou-0.8',
        # NOTE(review): pyspider's recognized option is spelled 'timeout';
        # 'time_out' is most likely ignored -- confirm against pyspider docs.
        'time_out': 4000,
        'proxy': 'H21WNK49K6PFSR3P:[email protected]:9010'
    }

    @staticmethod
    def _new_record():
        """Return a fresh, all-blank job record."""
        return {
            'name': '',
            'type': '',
            'num': '',
            'address': '',
            'time': '',
        }

    @every(minutes=24 * 60)
    @config(age=12 * 60 * 60)
    def on_start(self):
        """Entry point: fetch the first listing page to discover paging."""
        basic_url = 'https://hr.tencent.com/position.php?&start=#a0'
        self.crawl(url=basic_url, callback=self.basic_page, validate_cert=False)

    @config(age=12 * 60 * 60)
    def basic_page(self, response):
        """Read the total page count from the pager and schedule every page.

        Each listing page shows 10 positions, so the ``start`` query offset
        advances in steps of 10.
        """
        base_url = "https://hr.tencent.com/position.php?&start={}#a"
        # The 10th pager anchor carries the total number of pages.
        page_count = int(response.doc(" div.pagenav> a:nth-child(10)").text())
        for offset in range(0, page_count * 10, 10):
            self.crawl(url=base_url.format(offset),
                       validate_cert=False, callback=self.detail_page)

    @config(priority=2)
    @config(age=12 * 60 * 60)
    def detail_page(self, response):
        """Extract one record per job row on a listing page."""
        response.encoding = 'utf-8'
        # Fix: bind dic before the try block -- the original first assigned
        # it inside the row loop, so an exception raised before the first
        # row made the except handler crash with NameError.
        dic = self._new_record()
        try:
            detail_trs = response.doc('#position > div.left.wcont_b.box > table  ')
            # Drop the header row and the trailing pager row.
            detail_tr = detail_trs('tr:not(:first-child):not(:last-child)')
            for each_tr in detail_tr.items():
                dic = self._new_record()
                dic['name'] = each_tr('td:nth-child(1)').text().replace(' ', '')
                category = each_tr('td:nth-child(2)').text().replace(' ', '')
                # An empty category is stored as the literal string "null".
                dic['type'] = category if category else "null"
                dic['num'] = each_tr('td:nth-child(3)').text().replace(' ', '')
                dic['address'] = each_tr('td:nth-child(4)').text().replace(' ', '')
                dic['time'] = each_tr('td:nth-child(5)').text().replace(' ', '')
                yield dic
        except Exception:
            dic['exp'] = '0'
            if dic['name'] != '':
                yield dic

    def on_result(self, result):
        """Persist each yielded record.

        NOTE(review): both insert_text() and save_mysql() write into the
        same 'tbtest' table, so every record is inserted twice -- probably
        unintended; confirm which path should be kept.
        """
        print(result)
        if not result:
            return
        self.insert_text(tablename='tbtest', **result)
        self.save_mysql(result)

    def escape(self, string):
        """Identity formatter for SQL identifiers.

        WARNING: this performs no real escaping, so table/column names are
        interpolated into SQL verbatim -- safe only for trusted,
        hard-coded identifiers, never for external input.
        """
        return '%s' % string

    def insert_text(self, tablename=None, **values):
        """INSERT IGNORE the given column/value pairs into ``tablename``.

        :returns: True when the transaction committed, False on error.
        """
        kwargs = {
            'host': '192.168.59.128',
            'user': 'root',
            'passwd': '123456',
            'db': 'dbtest',
            'charset': 'utf8'
        }
        tablename = self.escape(tablename)
        cnx = pymysql.connect(**kwargs)
        try:
            cur = cnx.cursor()
            try:
                if values:
                    _keys = ",".join(self.escape(k) for k in values)
                    _values = ",".join(['%s', ] * len(values))
                    # Identifiers are interpolated (see escape()); the values
                    # themselves go through driver-side parameter binding.
                    sql_query = "INSERT IGNORE INTO %s (%s) values (%s)" % (
                        tablename, _keys, _values)
                    cur.execute(sql_query, list(itervalues(values)))
                cnx.commit()
                return True
            except Exception as e:
                print(e)
                return False
            finally:
                cur.close()
        finally:
            # Fix: the original never closed the cursor or the connection.
            cnx.close()

    def save_mysql(self, item):
        """Insert one job record into tbtest via a parameterized INSERT."""
        connect = pymysql.connect(host="192.168.59.128",
                                  user="root",
                                  password="123456",
                                  db="dbtest",
                                  charset="utf8",
                                  use_unicode=False)
        try:
            cursor = connect.cursor()
            try:
                sql = "insert into tbtest (name,type,num," \
                      "address,time)VALUES(%s,%s,%s,%s,%s)"
                # Locals renamed: the originals shadowed the builtin type()
                # and the imported time module.
                params = (item["name"], item['type'], item['num'],
                          item['address'], item['time'])
                cursor.execute(sql, params)
                connect.commit()
            finally:
                cursor.close()
        finally:
            # Fix: close the connection even when execute() raises.
            connect.close()

你可能感兴趣的:(爬虫)