python爬取赶集网

爬取赶集网二手交易市场所有类目,并将信息储存在数据库中。

效果是这样的:

python爬取赶集网_第1张图片
所有类目访问链接
python爬取赶集网_第2张图片
产品详情信息

我的代码:

#建立channel_list.py文件获取所有类目的访问链接
import requests
from bs4 import BeautifulSoup
# Page of the second-hand market that the category scrape starts from; it is
# the argument passed to get_channel_list() below.
start_url='http://bj.ganji.com/wu/'
def get_channel_list(url):
    """Print and return the absolute URL of every category linked from *url*.

    The page at *url* is downloaded, the anchors matching the CSS selector
    ``dl.fenlei dt a`` are collected, and each ``href`` is resolved against
    the site root.

    Args:
        url: Address of the page that lists the categories.

    Returns:
        A list with the absolute category URLs in document order. Earlier
        versions returned ``None``; callers that ignore the result are not
        affected.
    """
    # Local import so this function does not rely on a file-level import.
    from urllib.parse import urljoin

    base_url = 'http://bj.ganji.com'  # loop-invariant, so defined once
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')

    full_urls = []
    for channel in soup.select('dl.fenlei dt a'):
        residue_url = channel.get('href')
        if not residue_url:
            # An anchor without href cannot be turned into a URL.
            continue
        # urljoin copes with relative paths lacking a leading slash and with
        # hrefs that are already absolute; plain concatenation mangles both.
        full_url = urljoin(base_url, residue_url)
        print(full_url)
        full_urls.append(full_url)
    return full_urls
# Runs whenever this module is executed or imported: downloads start_url and
# prints every category URL found on it.
get_channel_list(start_url)

# Category URLs, one per line, kept in a single multi-line string.
# main.py (shown later in this article) turns it into a list with
# channel_list.split().
# NOTE(review): looks like the pasted output of get_channel_list(start_url)
# -- confirm before relying on it being up to date.
channel_list='''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/shoujihaoma/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
#建立link_list_detail_info.py文件获取每个类目的所有链接存放入数据库'linklists'及将每个类目的具体产品信息存放在'detailinfo'
import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
import random
# MongoDB handles: one database holding a collection of listing links and a
# collection of scraped item details.
client = MongoClient('localhost', 27017)
ganjiDB = client['ganjiDB']
linklists = ganjiDB['linklists']
detailinfo = ganjiDB['detailinfo']

# Browser-like User-Agent sent with the HTTP requests made in this file.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
}

# Candidate HTTP proxies; one is picked at random each time the module loads.
proxy_list = [
    'http://125.88.74.122:83',
    'http://113.18.193.5:8080',
    'http://113.18.193.7:8080',
    'http://120.92.3.127:90',
]
proxy_ip = random.choice(proxy_list)

# Proxy mapping intended to get around the site's per-IP access limit.
# NOTE(review): the requests.get calls visible in this file pass only
# `headers` -- confirm whether `proxies` is supposed to be passed as well.
proxies = {'http': proxy_ip}

def page_link(channel):
    """Feed every list-page URL of one category to link_list.

    For each of the two listing variants (URL part ``a1`` and ``a2``) the
    pages ``o1`` through ``o100`` are visited in order.

    Args:
        channel: Category base URL; the ``a<variant>o<page>`` suffix is
            appended to it unchanged.
    """
    for variant in (1, 2):
        # Lazy generator: each URL is formatted right before it is crawled.
        page_urls = (
            '{}a{}o{}'.format(channel, variant, number)
            for number in range(1, 101)
        )
        for page_url in page_urls:
            link_list(page_url)

def link_list(url):
    """Store the item links found on one category list page.

    The page is only processed when it contains an ``<a class="next">``
    element (i.e. it is not the last page) and the second character of the
    last URL path segment is ``'1'`` (personal listings) or ``'2'``
    (merchant listings). Otherwise ``'列表地址错误'`` is printed and nothing
    is stored.

    Args:
        url: List-page address of the form ``<channel>a<variant>o<page>``.

    Side effects:
        Sleeps two seconds before the request, inserts one document
        ``{'list_href': ...}`` per link into ``linklists`` and prints each
        stored link.
    """
    time.sleep(2)  # throttle the crawl
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')

    # Slicing (rather than indexing) yields '' for a too-short last segment,
    # so such URLs fall through to the error branch instead of raising.
    variant = url.split('/')[-1][1:2]
    has_next = soup.find('a', 'next')

    if has_next and variant == '1':
        # Personal listings: the href carries a query string that is dropped.
        anchors = soup.select('td.t a.t')
        strip_query = True
    elif has_next and variant == '2':
        # Merchant listings use a different selector and keep the full href.
        anchors = soup.select('a.ft-tit')
        strip_query = False
    else:
        print('列表地址错误')
        return

    for anchor in anchors:  # not named `list`, which would shadow the builtin
        list_href = anchor.get('href')
        if not list_href:
            # Anchor without href: nothing to store.
            continue
        if strip_query:
            list_href = list_href.split('?')[0]
        linklists.insert_one({'list_href': list_href})
        print(list_href)
#获取每个页面的具体信息
def get_detail_info(url):
    """Scrape one item detail page and store the result in ``detailinfo``.

    Two page layouts are recognised from the URL: one where the fifth
    character from the end is ``'x'`` and one where the seventh character
    from the end is ``'z'``. Any other URL prints ``'地址错误'`` and is not
    requested at all.

    Args:
        url: Address of the item detail page.

    Side effects:
        Inserts the scraped dict into ``detailinfo`` and prints it. When the
        page lacks one of the expected elements, a message is printed and
        nothing is stored.
    """
    # Slices instead of indexes: a URL shorter than the suffix being checked
    # ends up in the error branch instead of raising IndexError.
    is_x_page = url[-5:-4] == 'x'
    is_z_page = url[-7:-6] == 'z'
    if not (is_x_page or is_z_page):
        print('地址错误')
        return

    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')

    try:
        if is_x_page:  # checked first, as in the original if/elif order
            area_links = soup.select('div > div > div > div > ul > li > a')
            info = {
                'title': soup.select('h1.title-name')[0].text,
                'date': soup.select('i.pr-5')[0].text.strip(),
                'types': soup.select('ul > li > span > a')[5].text,
                'price': soup.select('i.f22.fc-orange.f-type')[0].text,
                'area': [link.text for link in area_links[-3:-1]],
                'url': url,
            }
        else:
            info = {
                'title': soup.select('h1.info_titile')[0].text,
                'price': soup.select('span.price_now i')[0].text,
                'area': soup.select('div.palce_li span i')[0].text,
                'url': url,
            }
    except IndexError:
        # A selector matched too few elements (presumably a removed or
        # redesigned page). Skip this URL so that one bad page does not
        # abort a whole batch of URLs.
        print('页面解析失败', url)
        return

    detailinfo.insert_one(info)
    print(info)
#建立main.py文件调用channel_list.py、link_list_detail_info.py中的属性和方法及数据库信息
from channel_list import channel_list
from link_list_detail_info import linklists,page_link,link_list
from link_list_detail_info import detailinfo,get_detail_info
from multiprocessing import Pool
import time
def get_all_links(channel):
    """Collect the listing links of one category by delegating to page_link.

    Serves as the worker function handed to ``Pool.map`` in this script.
    """
    page_link(channel)
def _pending_detail_urls():
    """Return the stored listing links whose details are not stored yet.

    This is what lets an interrupted run resume: URLs already present in
    ``detailinfo`` are subtracted from the links in ``linklists``.
    """
    db_urls = {item['list_href'] for item in linklists.find()}
    index_urls = {item['url'] for item in detailinfo.find()}
    return db_urls - index_urls


if __name__ == '__main__':
    # The context manager releases the worker processes when the block ends.
    with Pool() as pool:
        # Phase 1: collect the listing links of every category.
        pool.map(get_all_links, channel_list.split())
        time.sleep(10)
        # Phase 2: the pending set is computed only now, after phase 1 has
        # filled `linklists`. Computing it at import time (as before) missed
        # every link gathered during the current run and also queried the
        # database on each import of this module.
        rest_of_url = _pending_detail_urls()
        pool.map(get_detail_info, rest_of_url)
#建立count.py文件实时监控存入linklists中链接数量
from link_list_detail_info import linklists
import time
# Print the number of stored listing links every ten seconds, forever.
while True:
    # Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
    # Collection.count_documents() with an empty filter is its replacement.
    print(linklists.count_documents({}))
    time.sleep(10)
监控截图:
python爬取赶集网_第3张图片
监控效果图

总结:

  • MongoDB数据库基础功能的使用;
  • 多进程访问方式的引用;
  • 数据库查找的灵活调用实现断点续传;
  • map、lambda函数的使用;
  • 使用proxy及headers应对网站的反爬机制。
python爬取赶集网_第4张图片
Paste_Image.png

你可能感兴趣的:(python爬取赶集网)