Scraping Yixun Product Listings

A few performance notes:
  1. Product image data goes into PostgreSQL as a models.BinaryField column (Django 1.6).
  2. Django 1.6 adds persistent database connections via the CONN_MAX_AGE setting, which speeds up access (see the settings sketch after this list).
  3. Creating every task with gevent.spawn() gets the process killed outright: memory grows without bound, and once a resource limit is hit (memory overload, file handles, ...) the OS kills the process. Use gevent.pool.Pool to cap the number of concurrent tasks (see the pool sketch after this list).
  4. In postgresql.conf, raise the max_connections parameter to 300 concurrent connections.
  5. When importing data, drop the table's indexes first to speed up inserts (see the sketch after this list).
  6. Turn off Django's DEBUG mode (it makes memory grow without ever being released).
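A minimal sketch of the Django 1.6 pieces behind notes 1, 2 and 6. Everything here is assumed for illustration; the GoodsImage model and the database name are hypothetical, not part of the original project:

# settings.py (Django 1.6) -- only the keys relevant to notes 2 and 6
DEBUG = False                         # note 6: with DEBUG=True Django keeps every SQL query in memory

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'NAME': 'crawler',            # hypothetical database name
        'CONN_MAX_AGE': 600,          # note 2: reuse connections for up to 10 minutes
    }
}

# models.py -- note 1: raw image bytes stored in PostgreSQL
from django.db import models

class GoodsImage(models.Model):       # hypothetical model name
    goods_link = models.CharField(max_length=512)
    image = models.BinaryField()      # maps to a bytea column in PostgreSQL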
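A minimal sketch of the gevent.pool.Pool pattern from note 3; the pool size of 20 and the fetch() stand-in are assumptions, not values from the original script:

import gevent
import gevent.pool

def fetch(i):
    gevent.sleep(0.1)                 # stand-in for one scrape_cat() call
    return i

pool = gevent.pool.Pool(20)           # at most 20 greenlets in flight at once
tasks = [pool.spawn(fetch, i) for i in range(1000)]
gevent.joinall(tasks)

pool.spawn() blocks once the pool is full, so memory and file handles stay bounded instead of growing with the total task count.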
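Note 4 is a single line in postgresql.conf (max_connections = 300, followed by a server restart). For note 5, a hedged sketch of dropping and recreating an index around a bulk load from Django; the index and table names are hypothetical:

from django.db import connection

cursor = connection.cursor()
# drop the index so bulk inserts skip index maintenance
cursor.execute("DROP INDEX IF EXISTS yixun_goodsitem_name_idx")   # hypothetical index name
# ... run the bulk inserts (e.g. the GoodsItem saves in scrape_page) ...
cursor.execute("CREATE INDEX yixun_goodsitem_name_idx ON yixun_goodsitem (name)")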





# -*- coding: utf-8 -*-


'''
    yixun_crawler - product information collector for the Yixun site
    author: scott
    date:

    Only single-device login is considered for now.

    lxml's xpath still has some issues (or maybe I just haven't fully grasped it?)

    source-code and db-sample:
           http://114.215.178.29/static/projects/crawler/
'''

import imp
imp.load_source('init', '../init_script.py')
import gevent
import os, os.path, sys, struct, time, traceback, signal, threading, copy, base64, datetime

# from django.db import connection
# from django.db.models import Sum
from django.db import transaction

import yixun.models as yixun

from bson.objectid import ObjectId
import lxml.etree as etree

import urllib2, urlparse

dbfile = 'goods.txt'

fdbfile = open(dbfile, 'wb')

class ResourceItem:
    # one node in the category tree (cat1 -> cat2 -> cat3)
    def __init__(self, text, href=None, tag=None, parent=None):
        self.text = text
        self.tag = tag
        self.href = href
        self.children = []
        self.parent = parent


def scrape_page(url, pageIndex, cat1, cat2, cat3):
    print 'scrape_page:', url

    req = urllib2.urlopen(url)
    data = req.read()
    # savefile(data)
    html = etree.HTML(data.decode('utf-8'))

    # page size

    curPage = 0
    r = html.xpath('//*[@id="list"]/div[5]/div[2]/span/b/text()')
    if not r: return False
    curPage = r[0]
    r = html.xpath('//*[@id="list"]/div[5]/div[2]/span/text()')
    if not r: return False
    pageNum = int(r[0][1:])
    print pageNum, curPage

    # Edge case: when the requested page is past the last page, the server
    # just returns the first page again instead of an error.

    if pageIndex > pageNum:
        return False

    # collect the goods items on this page
    goods = html.xpath(u"//div[@class='mod_goods']")
    if not goods:
        print 'skipped..'
        return False
    for g in goods:
        for e in g.getchildren():
            if e.get('class') == 'mod_goods_info':    # the search below cannot be expressed in xpath, so walk the children one by one
                name = ''
                price = None
                link = ''
                for p in e.getchildren():
                    if p.get('class') == 'mod_goods_tit':
                        a = p.getchildren()[0]
                        name = a.text.encode('utf-8')
                        link = a.get('href')

                    if p.get('class') == 'mod_goods_price':
                        price = p.getchildren()[0].getchildren()[1].text.encode('utf-8')
                if name and price and link:
                    # print name, price, link
                    text = "%s || %s || %s || %s || %s || %s\n" % (cat1, cat2, cat3, name, price, link.strip())

                    print text

                    gitem = yixun.GoodsItem()
                    gitem.cat1 = cat1
                    gitem.cat2 = cat2
                    gitem.cat3 = cat3
                    gitem.name = name
                    gitem.cat5 = link

                    try:
                        gitem.price = float(price)
                    except:
                        pass
                    gitem.save()

                    # fdbfile.write(text)
                    # fdbfile.flush()

    return True

    # ss = p.xpath('..//dd/a')

'''
http://searchex.yixun.com/705740t705741-1-/?YTAG=2.1738456040037
http://searchex.yixun.com/html?path=705740t705741&area=1&sort=0&show=0&page=2&size=40&pf=0&as=0&charset=utf-8&YTAG=2.1738456040037#list
http://searchex.yixun.com/html?path=705740t705741&area=1&sort=0&show=0&page=1&size=40&pf=0&as=0&charset=utf-8&YTAG=2.1738456040037#list
'''
def scrape_cat(cat, yPageId, yPageLevel, tag, cat1, cat2, cat3):
    try:
        print cat.href
        # parse the category url to get the path index
        url = cat.href
        fs = urlparse.urlparse(url)
        path, qs = fs[2], fs[4]
        cat_idx = path[1:].split('-')[0]
        # tag = qs.split('=')[1]
        tag = "%s.%s%s" % (yPageLevel, yPageId, tag)
        # build the paged url and walk pages until scrape_page signals the end
        for page in range(1, 500):
            url = "http://searchex.yixun.com/html?path=%s&area=1&sort=0&show=0&page=%s&size=40&pf=0&as=0&charset=utf-8&YTAG=%s#list" % (cat_idx, page, tag)
            if not scrape_page(url, page, cat1, cat2, cat3):
                break

        return

    except:
        traceback.print_exc()
        # print 'page is null, skipped..'

def savefile(d, filename='sample.html'):
    f = open(filename, 'w')
    f.write(d)
    f.close()

def test():
    try:
        url = 'http://searchex.yixun.com/705740t705741-1-/?YTAG=2.1738456040037'
        fs = urlparse.urlparse(url)
        path, qs = fs[2], fs[4]
        cat_idx = path[1:].split('-')[0]
        tag = qs.split('=')[1]
        print cat_idx, tag

        return

        all_url = 'http://searchex.yixun.com/html?YTAG=3.705766287001&path=705882t705893'
        req = urllib2.urlopen(all_url)   # was urllib2.urlsplit, which does not exist
        html = req.read()
        savefile(html)

        dom = etree.HTML(html.decode('utf-8'))
        p = dom.xpath(u"//div[@title='品牌']")[0]
        ss = p.xpath('..//dd/a')
        print ss[0].text.encode('utf-8')

    except:
        traceback.print_exc()

def craw_start():
    import re
    try:
        all_url = 'http://searchex.yixun.com/?YTAG=2.1738456090000'
        req = urllib2.urlopen(all_url)
        html = req.read()

        # group = re.search("window\.yPageId ='(.*?)'", html)
        yPageId = re.search("window\.yPageId\s*=\s*'(\d+?)'", html).group(1)
        yPageLevel = re.search("window\.yPageLevel\s*=\s*'(\d+?)'", html).group(1)
        print yPageId, yPageLevel

        dom = etree.HTML(html.decode('gb2312'))
        all_cats = []
        cat1_list = dom.xpath("//div[@class='m_classbox']")
        for cat in cat1_list:
            cat1_text = cat.xpath('h3/text()')[0]
            cat1_e = ResourceItem(cat1_text)
            all_cats.append(cat1_e)
            print cat1_e.text.encode('utf-8')
            div = cat.xpath("div")[0]
            for dl in div.xpath('dl'):
                cat2 = dl.xpath('dt/a')[0]
                cat2_e = ResourceItem(cat2.text, href=cat2.attrib['href'], tag=cat2.attrib['ytag'], parent=cat1_e)
                cat1_e.children.append(cat2_e)
                print ' ' * 4, cat1_e.text.encode('utf-8'), cat2_e.href, cat2_e.tag
                for cat3 in dl.xpath('dd/a'):
                    cat3_e = ResourceItem(cat3.text, href=cat3.attrib['href'], tag=cat3.attrib['ytag'], parent=cat2_e)
                    cat2_e.children.append(cat3_e)
                    print ' ' * 8, cat3_e.text.encode('utf-8'), cat3_e.href, cat3_e.tag
        tasks = []
        for e1 in all_cats:
            print '-' * 1, e1.text.encode('utf-8')
            for e2 in e1.children:
                print '  ' * 2, e2.text.encode('utf-8')
                for e3 in e2.children:
                    print '  ' * 4, e3.text.encode('utf-8')
                    # spawning every category at once is what blows up memory;
                    # see performance note 3 -- a gevent.pool.Pool would cap this
                    task = gevent.spawn(scrape_cat, e3, yPageId, yPageLevel, e2.tag, e1.text.encode('utf-8'), e2.text.encode('utf-8'), e3.text.encode('utf-8'))
                    tasks.append(task)
                    # scrape_cat(e3, yPageId, yPageLevel, e2.tag, e1.text.encode('utf-8'), e2.text.encode('utf-8'), e3.text.encode('utf-8'))
                    # return
        gevent.joinall(tasks)
    except:
        traceback.print_exc()


if __name__ == '__main__':
    craw_start()
    # test()
    pass
