1212: Finished Crawling 58.com

To sum up: the crawl took a little over 8,300 seconds and finally collected 166,035 records (roughly 20 records per second).

Results

[Figure: pa58dedao.png]

The code (several files are pasted together below):

#!/usr/bin/python3
# coding=utf-8
# jerryLuan
#filename=channel_extact.py
from bs4 import BeautifulSoup
import requests


start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_index_url(url):
    # Fetch the category index page and print every second-level channel URL.
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')  # selector tested 2016-12-11
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

if __name__ == '__main__':
    # Guarded so that importing channel_list below does not trigger a crawl.
    get_index_url(start_url)

# Left out of the hand-curated list below:
# http://bj.58.com/shoujihao/  (not wanted)
# http://bj.58.com/tiaozao/

channel_list = '''
    http://bj.58.com/shouji/
    http://bj.58.com/tongxunyw/
    http://bj.58.com/danche/
    http://bj.58.com/fzixingche/
    http://bj.58.com/diandongche/
    http://bj.58.com/sanlunche/
    http://bj.58.com/peijianzhuangbei/
    http://bj.58.com/diannao/
    http://bj.58.com/bijiben/
    http://bj.58.com/pbdn/
    http://bj.58.com/diannaopeijian/
    http://bj.58.com/zhoubianshebei/
    http://bj.58.com/shuma/
    http://bj.58.com/shumaxiangji/
    http://bj.58.com/mpsanmpsi/
    http://bj.58.com/youxiji/
    http://bj.58.com/jiadian/
    http://bj.58.com/dianshiji/
    http://bj.58.com/ershoukongtiao/
    http://bj.58.com/xiyiji/
    http://bj.58.com/bingxiang/
    http://bj.58.com/binggui/
    http://bj.58.com/chuang/
    http://bj.58.com/ershoujiaju/
    http://bj.58.com/bangongshebei/
    http://bj.58.com/diannaohaocai/
    http://bj.58.com/bangongjiaju/
    http://bj.58.com/ershoushebei/
    http://bj.58.com/yingyou/
    http://bj.58.com/yingeryongpin/
    http://bj.58.com/muyingweiyang/
    http://bj.58.com/muyingtongchuang/
    http://bj.58.com/yunfuyongpin/
    http://bj.58.com/fushi/
    http://bj.58.com/nanzhuang/
    http://bj.58.com/fsxiemao/
    http://bj.58.com/xiangbao/
    http://bj.58.com/meirong/
    http://bj.58.com/yishu/
    http://bj.58.com/shufahuihua/
    http://bj.58.com/zhubaoshipin/
    http://bj.58.com/yuqi/
    http://bj.58.com/tushu/
    http://bj.58.com/tushubook/
    http://bj.58.com/wenti/
    http://bj.58.com/yundongfushi/
    http://bj.58.com/jianshenqixie/
    http://bj.58.com/huju/
    http://bj.58.com/qiulei/
    http://bj.58.com/yueqi/
    http://bj.58.com/chengren/
    http://bj.58.com/nvyongpin/
    http://bj.58.com/qinglvqingqu/
    http://bj.58.com/qingquneiyi/
    http://bj.58.com/chengren/
    http://bj.58.com/xiaoyuan/
    http://bj.58.com/ershouqiugou/
'''
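
Worth noting: http://bj.58.com/chengren/ appears twice in channel_list, so two pool workers would end up crawling the same channel. A minimal sketch (my addition, with a hypothetical helper name unique_channels) that deduplicates the list while preserving order:

# Sketch: deduplicate channel_list before handing it to the pool.
def unique_channels(raw):
    seen = set()
    result = []
    for url in raw.split():
        if url not in seen:
            seen.add(url)
            result.append(url)
    return result

# usage in main.py: pool.map(get_all_links_from, unique_channels(channel_list))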
-----
#!/usr/bin/python3
# coding=utf-8
#jerryLuan
#filename=counts.py
import time
from pages_parsing import url_list

# Poll the url_list collection every 5 seconds so crawl progress
# can be watched from a separate terminal while main.py runs.
i = 0

while True:
    i += 1
    print(str(i * 5) + "s ===> " + str(url_list.find().count()))
    time.sleep(5)


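A compatibility note: url_list.find().count() relies on Cursor.count(), which was removed in pymongo 4. On a current driver the same monitor can be written with count_documents (a sketch under that assumption):

# Sketch for pymongo 4+, where Cursor.count() no longer exists.
import time
from pages_parsing import url_list

i = 0
while True:
    i += 1
    print('{}s ===> {}'.format(i * 5, url_list.count_documents({})))
    time.sleep(5)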
-----
#!/usr/bin/python3
# coding=utf-8
#jerryLuan
#filename=main.py
from multiprocessing import Pool
from channel_extact import channel_list
from pages_parsing import get_links_from


def get_all_links_from(channel):
    # 58 list pages run up to about 110 per channel.
    for i in range(1, 111):
        get_links_from(channel, i)


if __name__ == '__main__':
    pool = Pool()  # one worker per CPU core; e.g. Pool(processes=6) to pin it
    pool.map(get_all_links_from, channel_list.split())
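
With fifty-odd channels at up to 110 pages each, one dropped connection currently aborts that worker's whole channel, because requests.get raises on network errors. A hedged variant of get_all_links_from (the try/except is my addition, not in the original):

import requests
from pages_parsing import get_links_from

def get_all_links_from(channel):
    for i in range(1, 111):
        try:
            get_links_from(channel, i)
        except requests.exceptions.RequestException as e:
            # Skip the failed page instead of losing the rest of the channel.
            print('skipped {} page {}: {}'.format(channel, i, e))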
-----
#!/usr/bin/python3
# coding=utf-8
#jerryLuan
#filename=pages_parsing.py

from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
url_list = ceshi['url_list4']
item_info = ceshi['item_info4']


# The names on the left are the Python objects; the strings are the
# database and collection names in MongoDB.
# spider 1
def get_links_from(channel, pages, who_sells=0):
    # List pages look like {channel}{who_sells}/pn{page}/, e.g. .../shouji/0/pn3/
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # td.t only exists on pages that still have listings; stop when it is gone.
    if soup.find('td', 't'):
        for link in soup.select('td.t a.t'):
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})
            print(item_link)
    else:
        # Past the last page; nothing to collect.
        pass

# spider 2
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # 58 redirects removed listings to a 404 page; detect it from the first
    # script tag's src path. Guard against a script tag with no src attribute.
    script = soup.find('script', type="text/javascript")
    no_longer_exist = script is not None and '404' in (script.get('src') or '').split('/')
    if no_longer_exist:
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price.c_f50')[0].text
        date = soup.select('.time')[0].text
        # Guard on the same selector that is indexed into, so a span.c_25d
        # without an <a> child cannot raise IndexError.
        area = list(soup.select('.c_25d a')[0].stripped_strings) if soup.select('.c_25d a') else None
        item_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})
        print({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})
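
As posted, main.py only drives spider 1; nothing ever calls get_item_info. A minimal phase-2 driver in the same Pool pattern (this file is my sketch, not part of the original post):

#!/usr/bin/python3
# coding=utf-8
# Sketch of a phase-2 driver: feed every URL that spider 1 stored
# in MongoDB to spider 2.
from multiprocessing import Pool
from pages_parsing import get_item_info, url_list

if __name__ == '__main__':
    # Materialize the URLs in the parent so only plain strings
    # cross the process boundary.
    urls = [record['url'] for record in url_list.find()]
    pool = Pool()
    pool.map(get_item_info, urls)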


