pyspider抓取数据信息

最近需要抓取一些信息,于是了解了一下 pyspider。话不多说,直接上代码。代码包括数据抓取、解析以及存入 MySQL 数据库。

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-09-06 09:53:44
# Project: dmp 登录抓取数据

from pyspider.libs.base_handler import *
import re


class Handler(BaseHandler):
    """Log in through the site's HTML form, then scrape the goods list.

    Flow: ``on_start`` fetches the landing page, ``login`` submits its form,
    ``login_ok`` requests the goods list, ``index_page`` follows every goods
    link and ``detail_page`` returns the scraped fields of one goods page.
    """

    crawl_config = {
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko)',
        }
    }

    # NOTE(review): minutes=0 presumably means "no periodic re-run" -- confirm
    # against pyspider's @every semantics.
    @every(minutes=0)
    def on_start(self):
        """Fetch the landing page that carries the login form."""
        # The original URL was 'http://http://xxxx.com/': the scheme was
        # doubled and the placeholder host differed from every other URL here.
        self.crawl('http://xxx.com/', callback=self.login)

    @config(age=0)
    def login(self, response):
        """Submit the login form found in ``response``.

        Every named ``<input>`` of the form is posted back with its current
        value (so hidden fields are carried over), with the user name and
        password filled in.
        """
        cookies = response.cookies
        url = response.doc("form").attr("action")
        data = {}
        for field in response.doc("form input"):
            # Inputs without a name would otherwise end up under a None key.
            if field.name:
                data[field.name] = field.value
            if field.name == "username":
                data["username"] = "test"
            if field.type == "password":
                data["password"] = "123456"
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "http://xxx.com/login",
            "Origin": "http://xxx.com",
        }
        self.crawl(url, callback=self.login_ok, data=data, cookies=cookies,
                   headers=headers, method="POST")

    @config(priority=2)
    def login_ok(self, response):
        """Request the goods list, passing on the cookies of the login response."""
        self.crawl("http://xxx.com/distributionGoods/list?pageSize=100&status=1",
                   cookies=response.cookies, callback=self.index_page)

    @config(age=0)
    def index_page(self, response):
        """Queue a detail-page crawl for every link matched by ``.active p > a``."""
        for link in response.doc(".active p > a").items():
            self.crawl(link.attr.href, cookies=response.cookies,
                       callback=self.detail_page)

    @config(age=0)
    def detail_page(self, response):
        """Return the URL, page title, product name and price of a goods page."""
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "productName": response.doc('.separate-line > .column-col1 > div > div').eq(0).text(),
            "price": response.doc('.column-col3 > div > div').text(),
        }

抓取数据入库

首先引入pymysql:
    import pymysql
获取抓取结果:
@config(priority=2)
    def detail_page(self, response):
        """Build one dict of order fields per ``<tr>`` element of the page.

        Each field is the text of the ``<td>`` at a fixed position in the
        row: position 0 for the first field name below, 1 for the second,
        and so on. The dicts are returned as a list, in document order.
        """
        # Field names in the order of the table cells they are read from.
        field_names = (
            "订单号",
            "订单ID",
            "订单号22",
            "订单日期",
            "订单状态",
            "省分",
            "地市",
            "商品类型",
            "商品名称",
            "套餐名称",
            "终端品牌",
        )
        records = []
        for row in response.doc("tr").items():
            cells = row('td')
            records.append(
                {name: cells.eq(pos).text() for pos, name in enumerate(field_names)}
            )
        return records

重写on_result方法

def on_result(self, result):
    """Clean the scraped order rows and bulk-insert them into ``ORDER_TMP``.

    ``result`` is either ``None`` or a list of dicts keyed by the scraped
    field names ("订单号", "订单日期", "终端品牌", ...). The dicts are
    normalised in place:

    * "订单日期" in ``YYYYMMDD`` form becomes a ``datetime.datetime``;
      an empty value becomes ``None``.
    * "终端品牌" and "套餐名称" equal to "-" become ``None``.

    Rows with an empty "订单号" are not inserted. When no row is left, no
    database connection is opened. Errors raised by the insert are printed
    and the transaction is rolled back; they are not re-raised.
    """
    # Function-scope import: this snippet only has pymysql imported at file level.
    import datetime

    print("result: ", result)

    if result is None:
        return

    # Normalise the scraped values in place.
    for row in result:
        raw_date = row["订单日期"]
        if raw_date:
            iso_date = raw_date[0:4] + "-" + raw_date[4:6] + "-" + raw_date[6:]
            row["订单日期"] = datetime.datetime.strptime(iso_date, '%Y-%m-%d')
        else:
            row["订单日期"] = None
        # A "-" in these two columns means "no value".
        if row["终端品牌"] == "-":
            row["终端品牌"] = None
        if row["套餐名称"] == "-":
            row["套餐名称"] = None

    # Dict keys in the same order as the columns of the INSERT below.
    keys = (
        "订单号",
        "订单ID",
        "订单号22",
        "订单日期",
        "订单状态",
        "省分",
        "地市",
        "商品类型",
        "商品名称",
        "套餐名称",
        "终端品牌",
    )
    rows = [tuple(row[key] for key in keys) for row in result if row["订单号"]]
    if not rows:
        return

    config = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'admin',
        'password': '123456',
        'db': 'sales',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }

    # The original statement also listed NET_TYPE (12 columns, 12 placeholders)
    # while only 11 values are scraped per row, so every insert failed.
    sql = """INSERT INTO ORDER_TMP(ID, TRADE_ID, ESS_TRADE_ID,
            TRADE_DATE, TRADE_STATUS, PROVINCE, CITY, GOODS_TYPE,
            GOODS_NAME, PACKAGE_NAME, TERMINAL_BRAND) VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

    db = pymysql.connect(**config)
    try:
        cursor = db.cursor()
        print("sql: ", sql)
        try:
            cursor.executemany(sql, rows)
            db.commit()
        except Exception as e:
            # Best effort: report the failure and undo the partial insert.
            print("exception have occur: ", e)
            db.rollback()
    finally:
        # Close the connection even if creating the cursor fails.
        db.close()

你可能感兴趣的:(python,python,mysql)