Week2 hw2: A Mass-Crawling Workflow

When we deal with large amounts of data from the internet, it is necessary to design an efficient workflow built from multiple spiders.

Example

Spider1

We build our first spider to crawl the links we care about and save them into our database.

Spider2

We use another spider to fetch the details of every single page and save that information into another collection. Spider2 is like a worker following simple directions: all it needs to know is the URL of the page.

Advantage of this style

By splitting the work across separate, single-purpose workers, the program becomes more stable and easier to understand. The guiding principle is **only do one job at a time**, as sketched below.
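
To make the hand-off concrete, here is a minimal sketch of the two-spider pipeline. The collection names link_queue and item_store, the helpers spider1_collect and spider2_detail, and the example.com URLs are all made up for illustration; the real project below uses 58phoneLinks and 58phoneInfos.

import pymongo

client = pymongo.MongoClient('localhost', 27017)
db = client['testDB']
link_queue = db['link_queue']   # hypothetical collection written by spider1
item_store = db['item_store']   # hypothetical collection written by spider2

def spider1_collect(list_url):
    '''Spider1: only collects item URLs from one list page (placeholder parsing).'''
    for url in ['http://example.com/item/1', 'http://example.com/item/2']:
        link_queue.insert_one({'url': url})

def spider2_detail(url):
    '''Spider2: only fetches the details of a single URL (placeholder fields).'''
    item_store.insert_one({'url': url, 'detail': 'parsed fields go here'})

# spider1 fills the queue, spider2 drains it -- each worker does one job at a time
spider1_collect('http://example.com/list/pn1/')
for doc in link_queue.find():
    spider2_detail(doc['url'])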


Target

Get all the second-hand phone numbers for sale on 58.com.
Save this information into a database with pymongo.
Design and use two spiders in this project.

As usual, here are my utility functions.

import requests
from bs4 import BeautifulSoup
import pymongo
import time

"http://gz.58.com/shoujihao/pn1/"
start_url = 'http://gz.58.com/shoujihao/'
client = pymongo.MongoClient('localhost', 27017)
db = client['testDB'] # 专门用于存放作业
Linksheet = db['58phoneLinks']
ItemSheet = db['58phoneInfos']
endPage = False

def getLinks(channel, page):
    '''Collect the listing links on one list page and save them into the database.'''
    global endPage

    list_url = '{}pn{}/'.format(channel, page)  # channel already ends with '/'
    print(list_url)
    wb_data = requests.get(list_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('strong.number')
    links = soup.select('li > a.t')

    if len(soup.select("div.boxlist > ul > div.boxlist")) != 0:  # the listing container only appears on valid pages
        for title, link in zip(titles, links):
            linkUrl = link.get('href').split('?')[0]  # drop the tracking query string
            print(title.string, linkUrl)
            Linksheet.insert_one({
                'title': title.string,
                'url': linkUrl
            })
    else:
        print("this page is the end")  # no listings found, so we are past the last page
        endPage = True
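
# Example (sketch): getLinks(start_url, 1) fetches http://gz.58.com/shoujihao/pn1/ and, if the
# page layout holds, stores documents like {'title': <phone number>, 'url': <detail page URL>}.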

def getItemInfo(url):
    '''Crawl one item page and save its details into the database.'''
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # removed listings are replaced by a page whose script src contains '404'
    no_longer_exist = '404' in soup.find('script', type="text/javascript").get('src').split("/")
    if no_longer_exist:
        print("item no longer exists (probably sold)")
    else:
        title = soup.title.text
        price = soup.select('span.price')[0].get_text().strip()
        date = soup.select('li.time')[0].text.strip()  # posting date
        location = list(soup.select('.c_25d')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None
        ItemSheet.insert_one({
            'title': title,
            'price': price,
            'date': date,
            'area': location
        })
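
Neither function has any error handling, so a long crawl can stop on a single timeout or an unexpected page layout. The wrapper below is only a sketch of how spider2 could be kept alive; the name safe_get_item_info and the retry count are my own additions, not part of the assignment.

def safe_get_item_info(url, retries=2):
    '''Call getItemInfo(url), retrying a few times on network errors (sketch).'''
    for attempt in range(retries + 1):
        try:
            getItemInfo(url)
            return True
        except requests.exceptions.RequestException as e:
            print('request failed ({}), attempt {} of {}'.format(e, attempt + 1, retries + 1))
            time.sleep(2)
        except (AttributeError, IndexError) as e:
            print('unexpected page layout for {}: {}'.format(url, e))  # selectors found nothing; skip this item
            return False
    return False
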
Usage of the code
# Step 1: collect all the listing links with spider1
pageNumber = 1
while not endPage:
    getLinks(start_url, page=pageNumber)
    time.sleep(1)  # be polite: pause between list pages
    pageNumber += 1

# Step 2: crawl every item page with spider2
for itemUrl in Linksheet.find():
    url = itemUrl['url']
    if 'jump' in url.lower():  # skip redirect ('jump') links, which point to promoted entries
        print(itemUrl)
    else:
        print(url)
        time.sleep(0.5)
        getItemInfo(url)
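
After both steps finish, a quick sanity check (my own suggestion, assuming pymongo 3.7+ for count_documents) is to look at what actually landed in MongoDB:

print('links collected:', Linksheet.count_documents({}))
print('items crawled:', ItemSheet.count_documents({}))
print('sample item:', ItemSheet.find_one())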
