Approach: there are multiple channels (categories), each channel lists many product links, and each product has a detail page. First crawl the product links under each channel and store them in the database, then take every detail-page link back out of the database and scrape the information on its detail page.
First, channel_extact.py crawls the links of the different channels:
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_index_url(url):  # extract the navigation-bar links; each channel has its own listing pages
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')  # grab every navigation-bar link
    for link in links:
        page_url = url_host + link.get('href')  # turn the relative href into a full URL
        print(page_url)

get_index_url(start_url)
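get_index_url only prints the channel URLs; the channel_list string below was assembled from that output. As a minimal sketch, a variant that returns the links as a list instead could look like this (get_channel_urls is an illustrative name, not part of the original scripts):
def get_channel_urls(url):
    # same selector as get_index_url above, but collect the full URLs instead of printing them
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    return [url_host + link.get('href') for link in soup.select('ul.ym-submnu > li > b > a')]
# channel_urls = get_channel_urls(start_url)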
# Long multi-line string of channel URLs (the phone-number channel was removed)
channel_list = '''
http://bj.58.com/shouji/
http://bj.58.com/tongxunyw/
http://bj.58.com/diannao/
http://bj.58.com/bijiben/
http://bj.58.com/pbdn/
http://bj.58.com/diannaopeijian/
http://bj.58.com/zhoubianshebei/
http://bj.58.com/shuma/
http://bj.58.com/shumaxiangji/
http://bj.58.com/mpsanmpsi/
http://bj.58.com/youxiji/
http://bj.58.com/jiadian/
http://bj.58.com/dianshiji/
http://bj.58.com/ershoukongtiao/
http://bj.58.com/xiyiji/
http://bj.58.com/bingxiang/
http://bj.58.com/binggui/
http://bj.58.com/chuang/
http://bj.58.com/ershoujiaju/
http://bj.58.com/yingyou/
http://bj.58.com/yingeryongpin/
http://bj.58.com/muyingweiyang/
http://bj.58.com/muyingtongchuang/
http://bj.58.com/yunfuyongpin/
http://bj.58.com/fushi/
http://bj.58.com/nanzhuang/
http://bj.58.com/fsxiemao/
http://bj.58.com/xiangbao/
http://bj.58.com/meirong/
http://bj.58.com/yishu/
http://bj.58.com/shufahuihua/
http://bj.58.com/zhubaoshipin/
http://bj.58.com/yuqi/
http://bj.58.com/tushu/
http://bj.58.com/tushubook/
http://bj.58.com/wenti/
http://bj.58.com/yundongfushi/
http://bj.58.com/jianshenqixie/
http://bj.58.com/huju/
http://bj.58.com/qiulei/
http://bj.58.com/yueqi/
http://bj.58.com/bangongshebei/
http://bj.58.com/diannaohaocai/
http://bj.58.com/bangongjiaju/
http://bj.58.com/ershoushebei/
http://bj.58.com/danche/
http://bj.58.com/fzixingche/
http://bj.58.com/diandongche/
http://bj.58.com/sanlunche/
http://bj.58.com/peijianzhuangbei/
http://bj.58.com/tiaozao/
'''
Next is pages_parsing.py, which contains two spiders: spider 1 stores all the product links under a channel in the database, and spider 2 scrapes the information on each detail page and stores it in the database.
from bs4 import BeautifulSoup
import requests
import time
import pymongo

# Store the product links scraped from each channel in the database, then take every detail-page
# link back out of the database, crawl it with spider 2, and store the result in the database.
client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']          # database
url_list = ceshi['url_list4']    # collection for the product links
item_info = ceshi['item_info4']  # collection for the information from each detail page
# The name on the left is the Python object; the string on the right is the name used in MongoDB.

# spider 1: grab all the product links under one category, one specified page at a time
def get_links_from(channel, pages, who_sells=1):
    # Each product is one table row (td.t); if a page has none, the listings are exhausted.
    # Example list page: http://bj.58.com/diannao/pn2/
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):  # check whether this page still has listings
        for link in soup.select('td.t a.t'):  # loop over the product links on the page
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})  # insert the link into the database
            print(item_link)

# get_links_from('http://bj.58.com/shuma/', 2)
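Running spider 1 more than once stores the same link repeatedly. As a sketch of one way to avoid that, assuming the url_list collection defined above, a unique index plus an upsert keeps each URL only once (save_link is an illustrative helper, not in the original code):
url_list.create_index('url', unique=True)  # one document per URL

def save_link(item_link):
    # insert the link only if it is not in the collection yet; otherwise do nothing
    url_list.update_one({'url': item_link}, {'$setOnInsert': {'url': item_link}}, upsert=True)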
# spider 2: scrape the product detail-page information; removed listings redirect to a 404 page
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # the 404 page loads a script whose src path contains '404'
    no_longer_exist = '404' in soup.find('script', type='text/javascript').get('src').split('/')
    if no_longer_exist:  # skip listings that no longer exist
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price.c_f50')
        date = soup.select('.time')
        area = soup.select('.c_25d a') if soup.find_all('span', 'c_25d') else []  # some listings have no location
        areas = [are.get_text() for are in area]  # the multi-level location; could be improved
        areas2 = []
        for i in range(0, len(areas) - 1):
            areas2.append(areas[i] + '-' + areas[i + 1])  # join neighbouring levels, e.g. city-district
        for pric, dat in zip(price, date):  # normally one price/date pair per page
            data = {
                'title': title,
                'price': pric.get_text(),
                'date': dat.get_text(),
                'area': areas2,  # MongoDB can store the list directly
                'url': url,
            }
            item_info.insert_one(data)
            print(data)

url1 = 'http://bj.58.com/shuma/29075926847818x.shtml'
get_item_info(url1)
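Instead of a single test URL, spider 2 is meant to be fed from the links that spider 1 stored. A minimal sketch, assuming the url_list and item_info collections defined above; comparing the two sets also lets an interrupted run resume without re-scraping pages that are already in item_info:
collected = set(item['url'] for item in url_list.find({}, {'url': 1}))   # everything spider 1 found
scraped = set(item['url'] for item in item_info.find({}, {'url': 1}))    # detail pages already stored
for url in collected - scraped:  # only the detail pages that are still missing
    get_item_info(url)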
The main program, main.py, runs the crawl with multiple processes:
from multiprocessing import Pool  # multiprocessing lets the task use several CPU cores
from channel_extact import channel_list
from pages_parsing import get_links_from

def get_all_links_from(channel):  # crawl every page of one channel and write its links to the database
    for i in range(1, 100):
        get_links_from(channel, i)

if __name__ == '__main__':  # required guard when using multiprocessing
    pool = Pool()  # create a process pool; the pool hands work to the CPUs. A processes argument
                   # controls how many workers to start; by default it is chosen automatically.
    pool.map(get_all_links_from, channel_list.split())  # map feeds each channel URL in channel_list to get_all_links_from
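get_all_links_from always requests up to 99 pages per channel, even after the listings run out. A small sketch of how it could stop earlier, assuming get_links_from is changed to return True while a page still contains td.t rows and False once it does not (that return value is not in the original function):
def get_all_links_from(channel):
    for i in range(1, 100):
        if not get_links_from(channel, i):  # assumed return value: False once the pages run out
            break  # no more listing pages for this channel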
Finally, a small script that keeps printing how many product links have been collected in the database:
import time
from pages_parsing import url_list

while True:
    print(url_list.find().count())  # number of product links stored so far
    time.sleep(5)
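On newer pymongo releases find().count() has been removed; a drop-in alternative, assuming the same url_list collection, is count_documents:
while True:
    print(url_list.count_documents({}))  # total number of documents in the collection
    time.sleep(5)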