I recently needed to scrape some data, so I looked into pyspider. Without further ado, here is the code; it covers crawling the data, parsing it, and writing it into a MySQL database.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-09-06 09:53:44
# Project: dmp - log in and crawl the data
from pyspider.libs.base_handler import *
class Handler(BaseHandler):
    crawl_config = {
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko)',
        }
    }

    @every(minutes=0)
    def on_start(self):
        self.crawl('http://xxxx.com/', callback=self.login)

    @config(age=0)
    def login(self, response):
        cookies = response.cookies
        # The form's action attribute is the login endpoint
        url = response.doc("form").attr("action")
        data = {}
        for each in response.doc("form input"):
            data[each.name] = each.value
            if each.name == "username":
                data["username"] = "test"
            if each.type == "password":
                data["password"] = "123456"
        headers = {}
        headers["Content-Type"] = "application/x-www-form-urlencoded"
        headers["Referer"] = "http://xxx.com/login"
        headers["Origin"] = "http://xxx.com"
        self.crawl(url, callback=self.login_ok, data=data,
                   cookies=cookies, headers=headers, method="POST")

    @config(priority=2)
    def login_ok(self, response):
        # Carry the session cookies over to the order list page
        self.crawl("http://xxx.com/distributionGoods/list?pageSize=100&status=1",
                   cookies=response.cookies, callback=self.index_page)

    @config(age=0)
    def index_page(self, response):
        for each in response.doc(".active p > a").items():
            self.crawl(each.attr.href, cookies=response.cookies,
                       callback=self.detail_page)

    @config(age=0)
    def detail_page(self, response):
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "productName": response.doc('.separate-line > .column-col1 > div > div').eq(0).text(),
            "price": response.doc('.column-col3 > div > div').text(),
        }
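The login() method above leans on an lxml.html detail: iterating a PyQuery selection yields form elements whose .name, .value and .type attributes map straight onto the form fields. Here is a minimal standalone sketch of that extraction, using a hypothetical login form (the real page's markup will differ):

from pyquery import PyQuery

html = """
<form action="/login" method="post">
    <input type="hidden" name="csrf_token" value="abc123">
    <input type="text" name="username">
    <input type="password" name="password">
</form>
"""
doc = PyQuery(html)
data = {}
for each in doc("form input"):
    # each is an lxml.html InputElement, so .name/.value/.type all work
    data[each.name] = each.value
    if each.name == "username":
        data["username"] = "test"
    if each.type == "password":
        data["password"] = "123456"
print(doc("form").attr("action"))  # /login
print(data)  # {'csrf_token': 'abc123', 'username': 'test', 'password': '123456'}

Hidden fields such as a CSRF token are copied over as-is, which is exactly what makes this kind of form replay work.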
Storing the scraped data in the database

First, import pymysql, plus datetime for parsing the order date:

import pymysql
import datetime

Next, collect the scraped results; this version of detail_page returns one dict per table row:
@config(priority=2)
def detail_page(self, response):
    return [{
        "订单号": x('td').eq(0).text(),
        "订单ID": x('td').eq(1).text(),
        "订单号22": x('td').eq(2).text(),
        "订单日期": x('td').eq(3).text(),
        "订单状态": x('td').eq(4).text(),
        "省分": x('td').eq(5).text(),
        "地市": x('td').eq(6).text(),
        "商品类型": x('td').eq(7).text(),
        "商品名称": x('td').eq(8).text(),
        "套餐名称": x('td').eq(9).text(),
        "终端品牌": x('td').eq(10).text(),
    } for x in response.doc("tr").items()]
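Note that response.doc("tr").items() also yields the header row, whose cells are <th> rather than <td>, so its dict comes back full of empty strings; on_result below filters those rows out by checking 订单号. A quick standalone check with a hypothetical, truncated two-column table:

from pyquery import PyQuery

html = """
<table>
    <tr><th>订单号</th><th>订单ID</th></tr>
    <tr><td>1001</td><td>A-1</td></tr>
</table>
"""
doc = PyQuery(html)
rows = [{
    "订单号": x('td').eq(0).text(),
    "订单ID": x('td').eq(1).text(),
} for x in doc("tr").items()]
print(rows)  # [{'订单号': '', '订单ID': ''}, {'订单号': '1001', '订单ID': 'A-1'}]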
Finally, override the on_result method to write the rows into MySQL:
def on_result(self, result):
    print("result: ", result)
    config = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'admin',
        'password': '123456',
        'db': 'sales',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    # Preprocess the rows before inserting
    if result is not None:
        for x in result:
            # Reformat the date from yyyymmdd to yyyy-mm-dd
            date_str = x["订单日期"]
            if date_str != "":
                date_str = date_str[0:4] + "-" + date_str[4:6] + "-" + date_str[6:]
                x["订单日期"] = datetime.datetime.strptime(date_str, '%Y-%m-%d')
            else:
                x["订单日期"] = None
            # A terminal brand of "-" means there is none
            if x["终端品牌"] == "-":
                x["终端品牌"] = None
            # A package name of "-" means there is none
            if x["套餐名称"] == "-":
                x["套餐名称"] = None
        # Build a list of tuples, skipping rows without an order number
        # (e.g. the header row, whose <td> lookups returned "")
        rows = []
        for x in result:
            obj = (
                x["订单号"],
                x["订单ID"],
                x["订单号22"],
                x["订单日期"],
                x["订单状态"],
                x["省分"],
                x["地市"],
                x["商品类型"],
                x["商品名称"],
                x["套餐名称"],
                x["终端品牌"],
            )
            if x["订单号"] != "":
                rows.append(obj)
        db = pymysql.connect(**config)
        cursor = db.cursor()
        # Eleven columns to match the eleven fields per row (the original
        # draft also listed a NET_TYPE column, but no value was scraped for it)
        sql = """INSERT INTO ORDER_TMP(ID, TRADE_ID, ESS_TRADE_ID,
            TRADE_DATE, TRADE_STATUS, PROVINCE, CITY, GOODS_TYPE,
            GOODS_NAME, PACKAGE_NAME, TERMINAL_BRAND) VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        print("sql: ", sql)
        try:
            cursor.executemany(sql, rows)
            db.commit()
        except Exception as e:
            print("exception occurred: ", e)
            db.rollback()
        finally:
            db.close()
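For completeness, here is a plausible DDL for the ORDER_TMP table the insert targets. Only the column names come from the INSERT statement above; the types and lengths are my own assumptions, so adjust them to the real data:

import pymysql

ddl = """CREATE TABLE IF NOT EXISTS ORDER_TMP (
    ID VARCHAR(64),
    TRADE_ID VARCHAR(64),
    ESS_TRADE_ID VARCHAR(64),
    TRADE_DATE DATETIME,
    TRADE_STATUS VARCHAR(32),
    PROVINCE VARCHAR(32),
    CITY VARCHAR(32),
    GOODS_TYPE VARCHAR(32),
    GOODS_NAME VARCHAR(128),
    PACKAGE_NAME VARCHAR(128),
    TERMINAL_BRAND VARCHAR(64)
) DEFAULT CHARSET=utf8mb4"""

db = pymysql.connect(host='127.0.0.1', port=3306, user='admin',
                     password='123456', db='sales', charset='utf8mb4')
try:
    with db.cursor() as cursor:
        cursor.execute(ddl)
    db.commit()
finally:
    db.close()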