Python Practice Assignment, Week 2: Scraping Product Listings from Ganji.com

Tasks:

1. From http://sh.ganji.com/wu/, collect the links to each goods category.

2. From each category page, collect the item links, with an end-of-pagination check so that invalid pages are not scraped (see the sketch after this list).

3. From each item page, extract the item details: title (goods_title), price (price), and transaction location (swap_site; stored as area in the code below).
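On the end-of-pagination check in task 2: Ganji keeps serving pages past the last real listing page, but those pages carry a placeholder div with class noinfo, which is what the code in Part 2 tests for. A minimal sketch of the idea (is_last_page is a hypothetical helper, not part of the assignment code):

from bs4 import BeautifulSoup
import requests

def is_last_page(page_url):
    # Pages past the end of a category carry <div class="noinfo">
    soup = BeautifulSoup(requests.get(page_url).text, 'lxml')
    return bool(soup.select('#infolist > div.noinfo'))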

Results:

Item links
(screenshot: Snip20170531_5.png)

Item details
(screenshot: Snip20170531_6.png)

Multiprocessing
(screenshot: Snip20170531_7.png)

Code:

Part 1: collecting the category links

from bs4 import BeautifulSoup
import requests

# This block was run once to scrape the category links from the index
# page; the results are hard-coded in `channels` below so the index
# does not need to be fetched again.
'''
start_url = 'http://sh.ganji.com/wu/'
url = 'http://sh.ganji.com'
wb_data = requests.get(start_url)
soup = BeautifulSoup(wb_data.text, 'lxml')

channel_links = soup.select('#wrapper > div.content > div > div > dl > dt > a')

for channel_link in channel_links:
    link = url + channel_link.get('href')
    print(link)
'''
channels = '''
    http://sh.ganji.com/jiaju/
    http://sh.ganji.com/rirongbaihuo/
    http://sh.ganji.com/shouji/
    http://sh.ganji.com/bangong/
    http://sh.ganji.com/nongyongpin/
    http://sh.ganji.com/jiadian/
    http://sh.ganji.com/ershoubijibendiannao/
    http://sh.ganji.com/ruanjiantushu/
    http://sh.ganji.com/yingyouyunfu/
    http://sh.ganji.com/diannao/
    http://sh.ganji.com/xianzhilipin/
    http://sh.ganji.com/fushixiaobaxuemao/
    http://sh.ganji.com/meironghuazhuang/
    http://sh.ganji.com/shuma/
    http://sh.ganji.com/laonianyongpin/
    http://sh.ganji.com/xuniwupin/
    http://sh.ganji.com/qitawupin/
    http://sh.ganji.com/ershoufree/
    http://sh.ganji.com/wupinjiaohuan/
    '''
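Because channels is a plain triple-quoted string, channels.split() turns it into a clean list of URLs (all surrounding whitespace is discarded), which is what Part 3 relies on. A quick check, assuming Part 1 is saved as get_channel.py as the import in Part 3 suggests:

from get_channel import channels

# split() with no arguments drops all whitespace, leaving one URL per entry
print(len(channels.split()))   # 19 category URLs
print(channels.split()[0])     # http://sh.ganji.com/jiaju/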

Part 2: scraping item links and item details

from bs4 import BeautifulSoup
import requests
import pymongo
import time


# MongoDB: one collection for item URLs, one for item details
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
sheet_urls = ganji['sheet_urls']
sheet_info = ganji['sheet_info']

# Sample listing pages and a sample item page, used to test the two
# functions below
urls = ['http://sh.ganji.com/jiaju/o{}'.format(i) for i in range(57, 63)]
url = 'http://zhuanzhuan.ganji.com/detail/788991434206199812z.shtml'


def get_url(url):
    # Fetch one listing page and store every item link on it
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(2)
    # Past the last page Ganji serves a placeholder <div class="noinfo">,
    # so stop before inserting anything
    if soup.select('#infolist > div.noinfo'):
        print('End')
        return
    goods_urls = soup.select('#infolist > div > table > tbody > tr > td.t > a')
    for goods_url in goods_urls:
        href = goods_url.get('href')
        # Keep only Zhuanzhuan item pages; skip Ganji's own promo links
        if href.split('/')[2] != 'sh.ganji.com':
            clean_url = href.split('?')[0]
            sheet_urls.insert_one({'url': clean_url})
            print(clean_url)

def get_info(url):
    # Fetch one item page and store its title, price and location
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(2)
    # Deleted items redirect to a stub page with this fixed title
    if soup.title.text.split() == ['【图】_的闲置物品-转转,赶集二手']:
        pass
    else:
        goods_title = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')[0].text
        price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')[0].text
        area = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
        # times, types, degrees: further fields, selectors not filled in yet
        print(goods_title, price, area)
        sheet_info.insert_one({'goods_title': goods_title, 'price': price, 'area': area})
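A quick smoke test of the two functions, using the sample values defined at the top of this part:

# Listing pages near the end of the jiaju channel: exercises both the
# link collection and the end-of-page check
for page_url in urls:
    get_url(page_url)

# A single item page: writes one record into sheet_info
get_info(url)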

Part 3: running both spiders with a process pool

from multiprocessing import Pool
from get_urls import get_url, get_info
import pymongo
from get_channel import channels


client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
sheet_urls = ganji['sheet_urls']
sheet_info = ganji['sheet_info']

def get_all_links(channel):
    # Each channel paginates as .../o1, .../o2, ...; crawl the first 10 pages
    for n in range(1, 11):
        url = channel + 'o{}'.format(n)
        get_url(url)


# The __main__ guard keeps child processes from re-running this block
if __name__ == '__main__':
    pool = Pool(4)

    # Spider 1: collect all item links, one channel per worker process;
    # map() blocks until every channel is done
    pool.map(get_all_links, channels.split())

    # Spider 2: fetch the details of every stored link
    for i in sheet_urls.find():
        pool.apply_async(get_info, args=(i['url'],))
    pool.close()
    pool.join()
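If the crawl is re-run, Spider 1 appends to sheet_urls without checking for duplicates, so Spider 2 would fetch the same items again. A minimal guard, assuming the same collections as above, is to iterate pymongo's distinct() values instead of find() (a sketch, not part of the original assignment):

# Each unique URL is fetched exactly once
for goods_url in sheet_urls.distinct('url'):
    pool.apply_async(get_info, args=(goods_url,))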
