Python Crawler Practice Notes 2-3: Multiprocess Crawler

Practice using multiprocessing.Pool.
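
Before the crawler itself, here is a minimal standalone sketch of how Pool.map distributes work (the fetch function and the task-N strings are made up for illustration and are not part of the crawler): map() splits the input list across the worker processes and blocks until every task has returned.

from multiprocessing import Pool
import os
import time

def fetch(task):
    #  stand-in for a crawl job: pause briefly and report which worker handled it
    time.sleep(0.5)
    return '{} handled by pid {}'.format(task, os.getpid())

if __name__ == '__main__':
    pool = Pool(processes=4)
    results = pool.map(fetch, ['task-{}'.format(i) for i in range(8)])
    pool.close()
    pool.join()
    for line in results:
        print(line)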

Source code

main.py

#!/usr/bin/python
# -*- coding: UTF-8 -*-

#  multiprocess

from multiprocessing import Pool
import time

from urlhandler import insert_urls_by_nav, get_nav_urls
from mongoconn import mongoset

table = mongoset('58sale', 'itemurls')    #  not used below; urlhandler.py opens its own collection handle

if __name__ == '__main__':
    starttime = time.time()
    print('start: ')
    print(time.strftime('%Y-%m-%d %H:%M:%S'))

    pool = Pool()    #  worker count defaults to the number of CPU cores

    url = 'http://sh.58.com/sale.shtml'
    navurls = get_nav_urls(url)    #  one url per top-level category

    pool.map(insert_urls_by_nav, navurls)    #  blocks until every category has been crawled

    endtime = time.time()
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    elapsed = endtime - starttime
    print('elapsed: {:.0f} seconds'.format(elapsed))

urlhandler.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  functions to get item urls

from bs4 import BeautifulSoup
import requests
import time

from mongoconn import mongoset, mongoinsert


def get_soup(url):
    source = requests.get(url)
    soup = BeautifulSoup(source.text, 'lxml')
    return soup

def combineurls(url, page):
    pageurls = []
    for i in range(1, page+1):
        pageurl = '{}{}/'.format(url, i)
        pageurls.append(pageurl)
    return pageurls
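
#  e.g. combineurls('http://sh.58.com/shouji/pn', 3) returns
#  ['http://sh.58.com/shouji/pn1/', 'http://sh.58.com/shouji/pn2/', 'http://sh.58.com/shouji/pn3/']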

def get_nav_urls(url):
    #  collect the absolute url of every top-level category from the nav menu
    soup = get_soup(url)
    navlist = soup.select('ul.ym-mainmnu span.dlb > a')
    absurls = []
    for submnu in navlist:
        href = submnu.get('href')
        if not href:
            continue
        absurl = url[0:-11] + href    #  drop '/sale.shtml' to get the site root
        if absurl not in absurls:
            absurls.append(absurl)
    return absurls

def get_page_urls(url):
    #  build the paged listing urls for one category; the page count is hard-coded to 70
    urls = combineurls(url + 'pn', 70)
    return urls

def get_page_urls_bk(url):
    #  earlier version: probe the pager on successive pages to find the real max page number
    curpage = 1
    maxpage = 0
    while curpage > maxpage:
        maxpage = curpage
        pageurl = url + 'pn' + str(maxpage)
        soup = get_soup(pageurl)
        pager = soup.select('div.pager > a')
        pagenum = pager[len(pager)-3].select('span')[0].get_text()  #### the -3 index is a stopgap; needs rethinking
        curpage = int(pagenum)
    urls = combineurls(url + 'pn', maxpage)
    return urls
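
#  The -3 index above is brittle.  A hypothetical alternative (an untested
#  assumption about 58.com's pager markup, not part of the original file):
#  scan every link in div.pager and keep the largest page number found in its span.
def get_max_page(url):
    soup = get_soup(url + 'pn1')
    maxpage = 1
    for a in soup.select('div.pager > a'):
        span = a.select('span')
        if span:
            text = span[0].get_text().strip()
            if text.isdigit():
                maxpage = max(maxpage, int(text))
    return maxpage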

def listtodict(urls):
    #  wrap each url in a dict so the whole batch can be passed to insert_many()
    datamany = []
    for itemurl in urls:
        data = {
            'itemurl': itemurl
        }
        datamany.append(data)
    return datamany

def get_item_urls(url):
    #  collect the detail-page url of every listing on one page
    soup = get_soup(url)
    print(url)
    itemlist = soup.select('tr.zzinfo > td.img > a')
    itemurls = []
    for item in itemlist:
        itemurl = item.get('href')
        if itemurl:
            itemurls.append(itemurl)
    #time.sleep(1)
    return itemurls

def getemtext(element):
    #  strip tabs, newlines and spaces from an element's text
    return element.get_text().strip().replace('\t', '').replace('\n', '').replace(' ', '')

def get_urls_by_nav(navurl):
    #  earlier version that writes via mongoinsert(); superseded by insert_urls_by_nav below
    navurls = get_page_urls(navurl)
    for pageurl in navurls:
        itemurls = get_item_urls(pageurl)
        mongoinsert(table, listtodict(itemurls))

table = mongoset('58sale', 'itemurls')    #  collection handle used by get_urls_by_nav and insert_urls_by_nav

def insert_urls_by_nav(navurl):
    navurls = get_page_urls(navurl)
    for pageurl in navurls:
        itemurls = get_item_urls(pageurl)
        #mongoinsert(table, listtodict(itemurls))
        if itemurls:    #  insert_many() rejects an empty list
            table.insert_many(listtodict(itemurls))

if __name__ == '__main__':
    url = 'http://sh.58.com/sale.shtml'
    print(get_nav_urls(url))
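
mongoconn.py (not included in the post; a possible minimal version)

The mongoconn module itself is not shown in these notes. The sketch below is inferred from how mongoset and mongoinsert are called above, and it assumes pymongo with a local MongoDB on the default port; treat it as a placeholder rather than the original file.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

#  hypothetical mongoconn.py, reconstructed from its call sites

from pymongo import MongoClient

def mongoset(dbname, tablename):
    #  open a connection to a local MongoDB instance and return a collection handle
    client = MongoClient('localhost', 27017)
    db = client[dbname]
    return db[tablename]

def mongoinsert(table, datamany):
    #  bulk-insert a list of documents; skip empty batches
    if datamany:
        table.insert_many(datamany)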

Output
2016-07-02 15:55:10
...
http://sh.58.com/shoujihao/pn4/
http://sh.58.com/shoujihao/pn5/
http://sh.58.com/danche/pn2/
http://sh.58.com/zixingche/pn2/
http://sh.58.com/shoujihao/pn6/
http://sh.58.com/shouji/pn2/
http://sh.58.com/shoujihao/pn7/
http://sh.58.com/shoujihao/pn8/
http://sh.58.com/danche/pn3/
http://sh.58.com/zixingche/pn3/
http://sh.58.com/shoujihao/pn9/
http://sh.58.com/shouji/pn3/
http://sh.58.com/shoujihao/pn10/
http://sh.58.com/shoujihao/pn11/
...
2016-07-02 15:57:38
Summary
  • Judging from the output, pages within each category are crawled in page order, while the categories themselves run in parallel. The shoujihao category finishes faster than the others because its pages contain no target listings.
  • Counting in the mongo shell shows that 41650 item urls were collected:
> db.itemurls.count()
41650
  • With the start and end times recorded, collecting the 41650 urls took 2 minutes 28 seconds, roughly 280 urls per second.
