Python Practical Plan Study Notes (9): Preparing for Large-Scale Crawling

Main tasks

Prepare two crawler scripts: channel_extract.py and page_parsing.py.

  • channel_extract.py crawls the URLs of all second-level category pages
  • page_parsing.py crawls the main information of every item listed under a category's detail pages and writes it to the database

channel_extract.py code

from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_channel_urls(url):
    # Fetch the top-level listing page and collect every second-level category link
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_channel_urls(start_url)
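
Since page_parsing.py will later need these channel URLs, here is a small sketch of my own (not part of the original notes) that collects them into a list instead of printing them:

def get_channel_list(url):
    # Hypothetical helper: same selector as above, but return the URLs for reuse
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    return [url_host + link.get('href')
            for link in soup.select('ul.ym-submnu > li > b > a')]

channel_list = get_channel_list(start_url)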

page_parsing.py code

from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost',27017)
test_58 = client['test_58']
url_list = test_58['url_list3']
item_info = test_58['item_info']

# spider 1: collect item detail-page URLs from one listing page of a channel
def get_links_from(channel, pages, who_sells=0):
    # Listing URL pattern, e.g. http://bj.58.com/shuma/0/pn2
    list_view = '{}{}/pn{}'.format(channel, str(who_sells), str(pages))
    web_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # 'td.t' only appears on valid listing pages; skip empty or out-of-range pages
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
            print(item_link)

# spider 2: parse one item detail page and store its key fields
def get_item_info(url):
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # Skip items that are already sold or taken down
    no_longer_exist = soup.find('span', 'soldout_btn')
    if no_longer_exist:
        pass
    else:
        title = soup.title.text.strip()
        price = soup.select('span.price_now i')[0].text
        viewed = soup.select('span.look_time')[0].text.split('次')[0]
        # 'palce_li' (sic) is the selector used in the original notes
        area = list(soup.select('div.palce_li > span > i')[0].stripped_strings) if soup.find_all('div', 'palce_li') else None
        item_info.insert_one({'title': title, 'price': price, 'viewed': viewed, 'area': area})
        print({'title': title, 'price': price, 'viewed': viewed, 'area': area})



# Quick smoke tests for the two spiders
get_item_info('http://zhuanzhuan.58.com/detail/756768837813551105z.shtml')

get_links_from('http://bj.58.com/shuma/', 2)
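
To move toward large-scale crawling, the listing spider eventually has to walk through many pages of a channel. Below is a minimal sketch of my own; the page range of 1-10 is an assumption, not from the original notes:

def get_all_links_from(channel, page_count=10):
    # Hypothetical helper: call spider 1 once per listing page of this channel
    for page in range(1, page_count + 1):
        get_links_from(channel, page)

get_all_links_from('http://bj.58.com/shuma/')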

Supplementary exercise

If the program is interrupted and restarted, we need to make sure no duplicate data is stored. Add a check to get_links_from() that tests whether a scraped URL is already in the database; if it is, skip it.
The relevant code:

            item_link = link.get('href').split('?')[0]
            if url_list.find_one({'url':item_link}):
                print('url exists')
            else:
                url_list.insert_one({'url':item_link})
                print(item_link)
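
An alternative worth noting (my own addition, not from the original notes): let MongoDB enforce uniqueness with a unique index on the url field, so duplicate inserts simply raise an error that we catch.

from pymongo.errors import DuplicateKeyError

# Assumption: a unique index on 'url' makes duplicate inserts fail fast
url_list.create_index('url', unique=True)

def save_link(item_link):
    # Hypothetical helper: insert, and treat a duplicate-key error as "already seen"
    try:
        url_list.insert_one({'url': item_link})
        print(item_link)
    except DuplicateKeyError:
        print('url exists')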
