Requirements
* Crawl the items listed across multiple category pages
* Save the item page links into a MongoDB database
* Extract each item's details, such as title, posting date, price and view count
Development Environment
Windows 10 64-bit, Python 3.7, PyCharm. The third-party packages used below are requests, BeautifulSoup (bs4) with the lxml parser, and pymongo.
Code
1. channel_extract.py (collects the category page links)
# Author: ZhouChuang
# coding: utf-8
from bs4 import BeautifulSoup
import requests

url_host = 'http://nb.58.com'
start_url = 'http://nb.58.com/sale.shtml'

def get_channel_urls(url):
    # fetch the second-hand goods index page and print every category link
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

if __name__ == '__main__':
    # run this file directly once to print the category links,
    # which were then pasted into channel_list below
    get_channel_urls(start_url)
channel_list = '''
http://nb.58.com/shouji/
http://nb.58.com/danche/
http://nb.58.com/diandongche/
http://nb.58.com/diannao/
http://nb.58.com/bijiben/
http://nb.58.com/pbdn/
http://nb.58.com/diannaopeijian/
http://nb.58.com/zhoubianshebei/
http://nb.58.com/shuma/
http://nb.58.com/shumaxiangji/
http://nb.58.com/mpsanmpsi/
http://nb.58.com/youxiji/
http://nb.58.com/ershoukongtiao/
http://nb.58.com/dianshiji/
http://nb.58.com/xiyiji/
http://nb.58.com/bingxiang/
http://nb.58.com/jiadian/
http://nb.58.com/binggui/
http://nb.58.com/chuang/
http://nb.58.com/ershoujiaju/
http://nb.58.com/yingyou/
http://nb.58.com/yingeryongpin/
http://nb.58.com/muyingweiyang/
http://nb.58.com/muyingtongchuang/
http://nb.58.com/yunfuyongpin/
http://nb.58.com/fushi/
http://nb.58.com/nanzhuang/
http://nb.58.com/fsxiemao/
http://nb.58.com/xiangbao/
http://nb.58.com/meirong/
http://nb.58.com/tushu/
http://nb.58.com/tushubook/
http://nb.58.com/wenti/
http://nb.58.com/yundongfushi/
http://nb.58.com/jianshenqixie/
http://nb.58.com/huju/
http://nb.58.com/qiulei/
http://nb.58.com/yueqi/
http://nb.58.com/kaquan/
http://nb.58.com/bangongshebei/
http://nb.58.com/diannaohaocai/
http://nb.58.com/bangongjiaju/
http://nb.58.com/ershoushebei/
http://nb.58.com/chengren/
'''
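main.py later feeds this block to the crawler by splitting it on whitespace; a quick check of what that produces:

# channel_list is one long whitespace-separated string of 44 category URLs;
# splitting it yields the plain list that main.py maps the crawler over
channels = channel_list.split()
print(len(channels))   # 44
print(channels[0])     # http://nb.58.com/shouji/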
2. page_parsing.py (saves each list page's item links to MongoDB and parses each item's title, posting date, price and area)
# Author: ZhouChuang
# coding: utf-8
from bs4 import BeautifulSoup
import requests, time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']            # database
url_list = ceshi['url_list3']      # collection of item page links
item_info = ceshi['item_info3']    # collection of parsed item details

# spider 1: collect the item links on one list page and store them in MongoDB
def get_links_from(channel, pages, who_sells=0):
    # e.g. http://nb.58.com/shouji/0/pn2/
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
            print(item_link)
    else:
        # no listing table on this page, probably past the last page
        pass

# spider 2: parse one item page and store its details in MongoDB
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # 58's "item removed" page loads a script whose src path contains '404'
    no_longer_exist = '404' in soup.find('script', type='text/javascript').get('src').split('/')
    if no_longer_exist:
        pass
    else:
        title = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')[0].text
        price = soup.select('span.price.c_f50')[0].text
        date = soup.select('.time')[0].text if soup.select('.time') else None
        area = list(soup.select('.c_25d a')[0].stripped_strings) if soup.select('.c_25d a') else None
        item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area})
        print({'title': title, 'price': price, 'date': date, 'area': area})

# test calls
# get_links_from('http://nb.58.com/diannao/', 2)
# get_item_info('http://sjz.58.com/shouji/35059087559756x.shtml')
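For reference, the list-page URL that get_links_from requests is assembled from the category URL, the who_sells flag and the page number; a quick check of the format string:

list_view = '{}{}/pn{}/'.format('http://nb.58.com/shouji/', 0, 2)
# -> 'http://nb.58.com/shouji/0/pn2/'

On 58.com the 0/1 segment appears to separate personal from merchant listings, so the default who_sells=0 keeps personal listings (my reading; the original post does not spell this out).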
3. main.py (runs the crawl)
# Author: ZhouChuang
# coding: utf-8
from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from

def get_all_links_from(channel):
    # crawl pages 1-100 of one category
    for num in range(1, 101):
        get_links_from(channel, num)

if __name__ == '__main__':
    pool = Pool()  # create a process pool, one worker per CPU core by default
    pool.map(get_all_links_from, channel_list.split())
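main.py only launches spider 1, so the get_item_info parser defined in page_parsing.py is never driven automatically. A minimal sketch of a second pass over the stored links, assuming the url_list collection populated above (this file is not part of the original post), could look like:

# item_spider.py -- hypothetical second pass, not in the original post
from multiprocessing import Pool
from page_parsing import get_item_info, url_list

if __name__ == '__main__':
    # read back every link collected by spider 1 and parse the item pages
    urls = [record['url'] for record in url_list.find()]
    pool = Pool()
    pool.map(get_item_info, urls)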
4. counts.py (monitors crawl progress)
import time
from page_parsing import url_list

while True:
    # print how many item links have been saved so far, every 5 seconds
    print(url_list.find().count())
    time.sleep(5)
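On newer pymongo releases cursor.count() is deprecated and eventually removed; if the line above raises, the collection-level count can be used instead (a small variation, assuming pymongo 3.7 or later):

import time
from page_parsing import url_list

while True:
    # count_documents({}) replaces the deprecated cursor.count()
    print(url_list.count_documents({}))
    time.sleep(5)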
Results
A total of 32,703 item links were crawled.