首先下载gooseeker抓取环境,到官网http://www.gooseeker.com/下载相关环境,根据教程文档http://www.gooseeker.com/tuto/tutorial.html创建抓取规则,并保存。这里尝试爬取的是某猫的搜索“笔记本”的网页https://list.tmall.com/search_product.htm?type=pc&q=%B1%CA%BC%C7%B1%BE&totalPage=1&sort=s&style=g&from=mallfp..pc_1_searchbutton
参数totalPage是页数,总共100页,我这是利用一个循环爬取每页的数据,亦可通过 教程中的翻页和层级 爬取所有数据
我的Python环境是2.7,因此使用下面适配Python 2.7的GooSeeker接口,返回的是按所定义规则提取出来的xml列表
#!/usr/bin/python
# -*- coding: utf-8 -*-
# 模块名: gooseeker_py2
# 类名: GsExtractor
# Version: 2.0
# 适配Python版本: 2.7
# 说明: html内容提取器
# 功能: 使用xslt作为模板,快速提取HTML DOM中的内容。
# released by 集搜客(http://www.gooseeker.com) on May 18, 2016
# github: https://github.com/FullerHua/jisou/core/gooseeker_py2.py
from urllib2 import urlopen
from urllib import quote
from lxml import etree
class GsExtractor(object):
    """Extract content from an HTML DOM by applying an XSLT template.

    The XSLT text is kept in ``self.xslt``. It can be loaded from a file,
    from a string already in memory, or fetched from the GooSeeker API.
    """

    def __init__(self):
        # Was spelled ``_init_``, so it never ran as the constructor and
        # ``self.xslt`` did not exist until one of the setters was called.
        self.xslt = ""

    def setXsltFromFile(self, xsltFilePath):
        """Read the XSLT template from the file at ``xsltFilePath``."""
        with open(xsltFilePath, 'r') as xslt_file:
            self.xslt = xslt_file.read()

    def setXsltFromMem(self, xsltStr):
        """Use the string ``xsltStr`` as the XSLT template."""
        self.xslt = xsltStr

    def setXsltFromAPI(self, APIKey, theme, middle=None, bname=None):
        """Download the XSLT template through the GooSeeker API.

        ``APIKey`` and ``theme`` are always sent as the ``key`` and ``theme``
        query parameters; ``middle`` and ``bname`` are appended only when
        they are truthy. ``theme``, ``middle`` and ``bname`` are URL-quoted.
        """
        apiurl = "http://www.gooseeker.com/api/getextractor?key=" + APIKey + "&theme=" + quote(theme)
        if middle:
            apiurl = apiurl + "&middle=" + quote(middle)
        if bname:
            apiurl = apiurl + "&bname=" + quote(bname)
        apiconn = urlopen(apiurl)
        try:
            self.xslt = apiconn.read()
        finally:
            # Release the HTTP connection even when reading fails.
            apiconn.close()

    def getXslt(self):
        """Return the XSLT template currently held by this extractor."""
        return self.xslt

    def extract(self, html):
        """Apply the current XSLT to ``html`` and return the result tree.

        ``html`` is an HTML DOM object as accepted by an ``etree.XSLT``
        transform; the return value is whatever that transform produces.
        """
        xslt_root = etree.XML(self.xslt)
        transform = etree.XSLT(xslt_root)
        return transform(html)
通过上述接口获取xml数据,使用 import xml.dom.minidom解析xml数据,如下代码
# -*- encoding=utf-8 -*-
# crawler_gooseeker_bbs.py
# 版本: V1.0
## 设置xslt抓取规则,第一个参数是app key 817b2794557252f2f46fb88733079c07,请到会员中心申请 规则名
## xml解析教程 http://www.runoob.com/python/python-xml.html
from urllib2 import urlopen
from lxml import etree
# Import the classes themselves: a bare ``import tmallInfo`` binds the module,
# and calling a module object raises TypeError.
from tmallInfo import tmallInfo
# NOTE(review): assumes the extractor file is saved as GsExtractor.py, as the
# original ``import GsExtractor`` implies (its header names the module
# gooseeker_py2) -- confirm the actual file name.
from GsExtractor import GsExtractor
import xml.dom.minidom
import MySQLdb


def _first_text(item, tag, default="null"):
    """Return the data of the first child node of the first ``tag`` element.

    Falls back to ``default`` when ``item`` has no such element or the
    element has no child nodes, so one incomplete listing does not abort
    the whole crawl.
    """
    elements = item.getElementsByTagName(tag)
    if elements and elements[0].childNodes:
        return elements[0].childNodes[0].data
    return default


conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='*****',  # MySQL password
    db='*****',  # MySQL database name
    # Set the charset explicitly: with MySQL's default latin-1 the inserts fail
    # with "UnicodeEncodeError: 'latin-1' codec can't encode characters".
    charset="utf8"
)
try:
    cur = conn.cursor()
    try:
        cur.execute(
            "create table if not exists tmallInfo(id int unsigned not null auto_increment primary key,pcName varchar(100),pcPrice varchar(30),storeName varchar(30),shopLink varchar(250),monthlyTurnover varchar(30),imgSrc varchar(150))")
    finally:
        cur.close()
    tmallInfo.conn = conn

    # The extraction rule does not change between pages, so fetch it once.
    # First argument is the app key (apply for one in the member center),
    # second is the rule name.
    bbsExtra = GsExtractor()
    bbsExtra.setXsltFromAPI("817b2794557252f2f46fb88733079c07", "lyTmall")

    # Pages could be fetched concurrently to speed the crawl up, see
    # http://www.runoob.com/python/python-multithreading.html
    for n in range(1, 101):
        url = "https://list.tmall.com/search_product.htm?type=pc&q=%B1%CA%BC%C7%B1%BE&totalPage=" + str(n) + "&sort=s&style=g&from=mallfp..pc_1_searchbutton"
        # Use a separate name so the database connection ``conn`` is not shadowed.
        response = urlopen(url)
        try:
            # The page is served as GBK; decode before parsing to avoid mojibake.
            doc = etree.HTML(response.read().decode("gbk"))
        finally:
            response.close()
        result = bbsExtra.extract(doc)  # apply the XSLT rule to the page
        DOMTREE = xml.dom.minidom.parseString(str(result))
        pcList = DOMTREE.documentElement
        for item in pcList.getElementsByTagName("item"):
            info = tmallInfo(
                _first_text(item, 'pcName'),
                _first_text(item, 'pcPrice'),
                _first_text(item, 'StoreName'),
                _first_text(item, 'ShopLink'),
                _first_text(item, 'monthlyTurnover'),
                _first_text(item, 'imgSrc'),
            )
            info.save()
        # Commit once per page so a later failure keeps the pages already crawled.
        tmallInfo.conn.commit()
        print(n)
finally:
    # Close the database connection even when a page fails.
    conn.close()
将爬取的数据保存到mysql数据库,总共5500多条数据,在保存时创建一个bean对象,方便保存,如下:
# -*- coding:utf-8 -*-
#MySQL环境(MySQL-python)下载 https://pypi.python.org/pypi/MySQL-python#downloads
import MySQLdb
class tmallInfo(object):
    """One product row of the ``tmallInfo`` table.

    The class attribute ``conn`` holds the database connection used by
    ``save()``; the caller must assign it before saving any instance.
    """

    # Shared database connection, set by the caller (None until then).
    conn = None

    def __init__(self, pcName, pcPrice, storeName, shopLink, monthlyTurnover, imgSrc):
        self.pcName = pcName
        self.pcPrice = pcPrice
        self.storeName = storeName
        self.shopLink = shopLink
        self.monthlyTurnover = monthlyTurnover
        self.imgSrc = imgSrc

    def save(self):
        """Insert this row through ``tmallInfo.conn`` and return execute()'s result.

        The statement is not committed here; the caller commits on the
        connection. The cursor is closed even when the insert raises.
        """
        cur = tmallInfo.conn.cursor()
        try:
            # %s are driver-side placeholders; the column list is fixed.
            sql = "insert into tmallInfo(pcName,pcPrice,storeName,shopLink,monthlyTurnover,imgSrc) values(%s,%s,%s,%s,%s,%s)"
            param = (self.pcName, self.pcPrice, self.storeName, self.shopLink, self.monthlyTurnover, self.imgSrc)
            return cur.execute(sql, param)
        finally:
            cur.close()
先新建一个对象,再调用save将其保存到数据库中
# Build a row object from the six extracted fields (these names must already be bound by the caller).
info = tmallInfo(pcName, pcPrice, storeName, shopLink, monthlyTurnover, imgSrc)
# Insert it through the shared tmallInfo.conn; save() does not commit, so the caller commits afterwards.
info.save()
将上述三个py文件放到一个目录中即可使用