Yixun product crawling
A few performance issues:
1. Product image data is stored in pgsql; field type: models.BinaryField (Django 1.6).
2. Django 1.6 supports persistent database connections via the CONN_MAX_AGE setting, which speeds up access (see the settings sketch after this list).
3. Spawning every task with gevent.spawn() up front gets the process killed outright: memory grows without bound, and once the resources involved (memory overload, file handles, ...) hit their limit the OS kills the process. Use gevent.pool.Pool to cap the number of concurrent greenlets (a sketch follows this list).
4. In postgresql.conf, raise the max_connections parameter to 300 concurrent connections.
5. When bulk-importing data, drop the table's indexes first to speed up inserts (see the index sketch below).
6. Turn DEBUG mode off in Django (with DEBUG on, memory keeps growing and is never released; covered in the settings sketch).
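For notes 2 and 6, a minimal settings.py sketch, assuming a local PostgreSQL database; the database name, user, and password below are placeholders, not the project's real values:

# settings.py (excerpt) -- placeholder values, not the project's real configuration
DEBUG = False   # note 6: with DEBUG=True Django records every query in memory and it is never released

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'NAME': 'yixun',          # placeholder database name
        'USER': 'postgres',       # placeholder user
        'PASSWORD': 'secret',     # placeholder password
        'HOST': '127.0.0.1',
        'PORT': '5432',
        'CONN_MAX_AGE': 600,      # note 2: reuse connections for up to 10 minutes (Django 1.6+)
    }
}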
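For note 3, a minimal sketch of bounding concurrency with gevent.pool.Pool instead of spawning one greenlet per category up front. The pool size of 20 is an arbitrary example; categories_to_crawl, cat1_name and cat2_name stand in for the loop variables built in craw_start() below, and scrape_cat is the function defined in the crawler:

import gevent.pool

pool = gevent.pool.Pool(20)   # at most 20 greenlets run at the same time

for cat in categories_to_crawl:   # hypothetical iterable of third-level category nodes
    # Pool.spawn() blocks while the pool is full, so memory and file handles stay bounded
    pool.spawn(scrape_cat, cat, yPageId, yPageLevel, cat.parent.tag,
               cat1_name, cat2_name, cat.text.encode('utf-8'))

pool.join()                   # wait for every queued greenlet to finish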
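For note 5, a sketch of dropping an index before a bulk import and rebuilding it afterwards through a raw Django cursor. The table and index names (yixun_goodsitem, yixun_goodsitem_name_idx) are assumptions based on Django's default naming, not verified against the actual schema:

from django.db import connection

cursor = connection.cursor()
# drop the index before the bulk insert ...
cursor.execute("DROP INDEX IF EXISTS yixun_goodsitem_name_idx")

# ... run the import (the gitem.save() calls) here ...

# ... then rebuild the index once the data is in
cursor.execute("CREATE INDEX yixun_goodsitem_name_idx ON yixun_goodsitem (name)")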
# -*- coding: utf-8 -*-
'''
yixun_crawler - product information collector for the Yixun site
author: scott
date:
Only single-device login is considered for now.
lxml's xpath still has a few issues (or maybe I just haven't fully grasped it?).
source code and db sample:
http://114.215.178.29/static/projects/crawler/
'''
import imp
imp.load_source('init', '../init_script.py')
import gevent
import os, os.path, sys, struct, time, traceback, signal, threading, copy, base64, datetime
# from django.db import connection
# from django.db.models import Sum
from django.db import transaction
import yixun.models as yixun
from bson.objectid import ObjectId
import lxml.etree as etree
import urllib2, urlparse

dbfile = 'goods.txt'
fdbfile = open(dbfile, 'wb')
class ResourceItem:
    def __init__(self, text, href=None, tag=None, parent=None):
        self.text = text
        self.tag = tag
        self.href = href
        self.children = []
        self.parent = parent
def scrape_page(url, pageIndex, cat1, cat2, cat3):
    print 'scrape_page: ', url
    req = urllib2.urlopen(url)
    data = req.read()
    # savefile(data)
    html = etree.HTML(data.decode('utf-8'))
    # page count
    curPage = 0
    r = html.xpath('//*[@id="list"]/div[5]/div[2]/span/b/text()')
    if not r: return False
    curPage = r[0]
    r = html.xpath('//*[@id="list"]/div[5]/div[2]/span/text()')
    if not r: return False
    pageNum = int(r[0][1:])
    print pageNum, curPage
    # if a page index greater than the total page count is passed in, the server returns the first page again
    if pageIndex > pageNum:
        return False
    # collect the goods entries on this page
    goods = html.xpath(u"//div[@class='mod_goods']")
    if not goods:
        print 'skipped..'
        return False
    for g in goods:
        for e in g.getchildren():
            if e.get('class') == 'mod_goods_info':  # the search below could not be expressed in xpath, so walk the children one by one
                name = ''
                price = None
                link = ''
                for p in e.getchildren():
                    if p.get('class') == 'mod_goods_tit':
                        a = p.getchildren()[0]
                        name = a.text.encode('utf-8')
                        link = a.get('href')
                    if p.get('class') == 'mod_goods_price':
                        price = p.getchildren()[0].getchildren()[1].text.encode('utf-8')
                if name and price and link:
                    # print name, price, link
                    text = "%s || %s || %s || %s || %s || %s\n" % (cat1, cat2, cat3, name, price, link.strip())
                    print text
                    gitem = yixun.GoodsItem()
                    gitem.cat1 = cat1
                    gitem.cat2 = cat2
                    gitem.cat3 = cat3
                    gitem.name = name
                    gitem.cat5 = link
                    try:
                        gitem.price = float(price)
                    except:
                        pass
                    gitem.save()
                    # fdbfile.write(text)
                    # fdbfile.flush()
    return True
    # ss = p.xpath('..//dd/a')
'''
http://searchex.yixun.com/705740t705741-1-/?YTAG=2.1738456040037
http://searchex.yixun.com/html?path=705740t705741&area=1&sort=0&show=0&page=2&size=40&pf=0&as=0&charset=utf-8&YTAG=2.1738456040037#list
http://searchex.yixun.com/html?path=705740t705741&area=1&sort=0&show=0&page=1&size=40&pf=0&as=0&charset=utf-8&YTAG=2.1738456040037#list
'''
def scrape_cat(cat, yPageId, yPageLevel, tag, cat1, cat2, cat3):
    try:
        print cat.href
        # parse url
        url = cat.href
        fs = urlparse.urlparse(url)
        path, qs = fs[2], fs[4]
        cat_idx = path[1:].split('-')[0]
        # tag = qs.split('=')[1]
        tag = "%s.%s%s" % (yPageLevel, yPageId, tag)
        # make path url
        for page in range(1, 500):
            url = "http://searchex.yixun.com/html?path=%s&area=1&sort=0&show=0&page=%s&size=40&pf=0&as=0&charset=utf-8&YTAG=%s#list" % (cat_idx, page, tag)
            if not scrape_page(url, page, cat1, cat2, cat3):
                break
        return
    except:
        traceback.print_exc()
        # print 'page is null, skipped..'
def savefile(d, filename='sample.html'):
    f = open(filename, 'w')
    f.write(d)
    f.close()
def test():
    try:
        url = 'http://searchex.yixun.com/705740t705741-1-/?YTAG=2.1738456040037'
        fs = urlparse.urlparse(url)
        path, qs = fs[2], fs[4]
        cat_idx = path[1:].split('-')[0]
        tag = qs.split('=')[1]
        print cat_idx, tag
        return
        all_url = 'http://searchex.yixun.com/html?YTAG=3.705766287001&path=705882t705893'
        req = urllib2.urlopen(all_url)  # was urllib2.urlsplit(), which does not exist
        html = req.read()
        savefile(html)
        dom = etree.HTML(html.decode('utf-8'))
        p = dom.xpath(u"//div[@title='品牌']")[0]
        ss = p.xpath('..//dd/a')
        print ss[0].text.encode('utf-8')
    except:
        traceback.print_exc()
def craw_start():
    import re
    try:
        all_url = 'http://searchex.yixun.com/?YTAG=2.1738456090000'
        req = urllib2.urlopen(all_url)
        html = req.read()
        # group = re.search(r"window\.yPageId ='(.*?)'", html)
        yPageId = re.search(r"window\.yPageId\s*=\s*'(\d+?)'", html).group(1)
        yPageLevel = re.search(r"window\.yPageLevel\s*=\s*'(\d+?)'", html).group(1)
        print yPageId, yPageLevel
        dom = etree.HTML(html.decode('gb2312'))
        all_cats = []
        cat1_list = dom.xpath("//div[@class='m_classbox']")
        for cat in cat1_list:
            cat1_text = cat.xpath('h3/text()')[0]
            cat1_e = ResourceItem(cat1_text)
            all_cats.append(cat1_e)
            print cat1_e.text.encode('utf-8')
            div = cat.xpath("div")[0]
            for dl in div.xpath('dl'):
                cat2 = dl.xpath('dt/a')[0]
                cat2_e = ResourceItem(cat2.text, href=cat2.attrib['href'], tag=cat2.attrib['ytag'], parent=cat1_e)
                cat1_e.children.append(cat2_e)
                print ' ' * 4, cat1_e.text.encode('utf-8'), cat2_e.href, cat2_e.tag
                for cat3 in dl.xpath('dd/a'):
                    cat3_e = ResourceItem(cat3.text, href=cat3.attrib['href'], tag=cat3.attrib['ytag'], parent=cat2_e)
                    cat2_e.children.append(cat3_e)
                    print ' ' * 8, cat3_e.text.encode('utf-8'), cat3_e.href, cat3_e.tag
        tasks = []
        for e1 in all_cats:
            print '-' * 1, e1.text.encode('utf-8')
            for e2 in e1.children:
                print ' ' * 2, e2.text.encode('utf-8')
                for e3 in e2.children:
                    print ' ' * 4, e3.text.encode('utf-8')
                    task = gevent.spawn(scrape_cat, e3, yPageId, yPageLevel, e2.tag, e1.text.encode('utf-8'), e2.text.encode('utf-8'), e3.text.encode('utf-8'))
                    tasks.append(task)
                    # scrape_cat(e3, yPageId, yPageLevel, e2.tag, e1.text.encode('utf-8'), e2.text.encode('utf-8'), e3.text.encode('utf-8'))
                    # return
        gevent.joinall(tasks)
    except:
        traceback.print_exc()
if __name__ == '__main__':
    craw_start()
    # test()
    pass
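The yixun.models module imported above is not included in the post. A hypothetical GoodsItem definition, inferred from the fields the crawler assigns plus the BinaryField mentioned in note 1, might look like this (field lengths are guesses):

# yixun/models.py -- hypothetical sketch, inferred from the crawler code above
from django.db import models

class GoodsItem(models.Model):
    cat1 = models.CharField(max_length=128)   # top-level category name
    cat2 = models.CharField(max_length=128)   # second-level category name
    cat3 = models.CharField(max_length=128)   # third-level category name
    name = models.CharField(max_length=256)   # product name
    cat5 = models.CharField(max_length=512)   # product link url
    price = models.FloatField(null=True, blank=True)
    image = models.BinaryField(null=True, blank=True)  # note 1: raw product image bytes (Django 1.6+)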