Week 2 Assignment

The Week 2 assignment, submitted a bit late: I was away on a business trip all weekend and only just got back. Thanks to the teacher for the guidance.

  • Getting the channel links
from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.ganji.com/wu/#all_category'
url_host = 'http://bj.ganji.com'

def get_channel_urls(start_url):
    # Fetch the category page and print every channel link from the sidebar
    wb_data = requests.get(start_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('.sider dd a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_channel_urls(start_url)
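
Running get_channel_urls(start_url) prints every channel URL, and that printed output appears to have been pasted by hand into the channel_list string below. As a small variant of my own (not part of the assignment), the same '.sider dd a' selector could return the URLs as a list instead of printing them:

def get_channel_list(start_url):
    # Same selection logic as above, but collect the URLs into a list
    wb_data = requests.get(start_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    return [url_host + link.get('href') for link in soup.select('.sider dd a')]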
channel_list = '''
    http://bj.ganji.com/chuangdian/
    http://bj.ganji.com/guizi/
    http://bj.ganji.com/zhuoyi/
    http://bj.ganji.com/shafachaji/
    http://bj.ganji.com/zixingchemaimai/
    http://bj.ganji.com/diandongche/
    http://bj.ganji.com/motuoche/
    http://bj.ganji.com/iphone/
    http://bj.ganji.com/nokia/
    http://bj.ganji.com/htc/
    http://bj.ganji.com/sanxingshouji/
    http://bj.ganji.com/motorola/
    http://bj.ganji.com/shouji/_%E5%B0%8F%E7%B1%B3/
    http://bj.ganji.com/shouji/_%E9%AD%85%E6%97%8F/
    http://bj.ganji.com/tongxuntaocan/
    http://bj.ganji.com/qqhao/
    http://bj.ganji.com/bangongjiaju/
    http://bj.ganji.com/jiguangyitiji/
    http://bj.ganji.com/dayinji/z1/
    http://bj.ganji.com/shipinjiagongshebei/
    http://bj.ganji.com/shengchanjiamengshebei/
    http://bj.ganji.com/jichuang/
    http://bj.ganji.com/tuolaji/
    http://bj.ganji.com/dianshi/
    http://bj.ganji.com/bingxiang/
    http://bj.ganji.com/kongtiao/
    http://bj.ganji.com/reshuiqi/
    http://bj.ganji.com/xiyiji/
    http://bj.ganji.com/diancilu/
    http://bj.ganji.com/weibolu/
    http://bj.ganji.com/yueqiyinxiang/
    http://bj.ganji.com/pingbandiannao/z1/
    http://bj.ganji.com/ershoubijibendiannao/z1/_%E8%8B%B9%E6%9E%9C/
    http://bj.ganji.com/ershoubijibendiannao/z1/_%E8%81%94%E6%83%B3/
    http://bj.ganji.com/ershoubijibendiannao/z1/_Thinkpad/
    http://bj.ganji.com/ershoubijibendiannao/z1/_%E7%B4%A2%E5%B0%BC/
    http://bj.ganji.com/ershoubijibendiannao/z1/_%E6%88%B4%E5%B0%94/
    http://bj.ganji.com/ershoubijibendiannao/z1/_%E5%8D%8E%E7%A1%95/
    http://bj.ganji.com/ershoubijibendiannao/z1/_%E6%83%A0%E6%99%AE/
    http://bj.ganji.com/yueqi/
    http://bj.ganji.com/yinxiang/
    http://bj.ganji.com/yundongqicai/
    http://bj.ganji.com/tongche/
    http://bj.ganji.com/tongzhuang/
    http://bj.ganji.com/yingerche/
    http://bj.ganji.com/yingerchuang/z1/
    http://bj.ganji.com/niaobushi/
    http://bj.ganji.com/wanju/
    http://bj.ganji.com/naifen/
    http://bj.ganji.com/taishidiannaozhengji/
    http://bj.ganji.com/xianka/
    http://bj.ganji.com/cpu/
    http://bj.ganji.com/yingpan/
    http://bj.ganji.com/luyouqi/
    http://bj.ganji.com/3gwangka/
    http://bj.ganji.com/shoucangpin/
    http://bj.ganji.com/qitalipinzhuanrang/
    http://bj.ganji.com/baojianpin/
    http://bj.ganji.com/xiaofeika/
    http://bj.ganji.com/fushi/
    http://bj.ganji.com/xiangbao/
    http://bj.ganji.com/xuemao/
    http://bj.ganji.com/shoubiao/
    http://bj.ganji.com/shipin/
    http://bj.ganji.com/huazhuangpin/
    http://bj.ganji.com/hufupin/
    http://bj.ganji.com/shumaxiangji/
    http://bj.ganji.com/shumashexiangji/
    http://bj.ganji.com/youxiji/
    http://bj.ganji.com/suishenting/
    http://bj.ganji.com/yidongcunchu/
    http://bj.ganji.com/zibubaojian/z2/
    http://bj.ganji.com/anmobaojian/z1/
    http://bj.ganji.com/bawanwujian/
    http://bj.ganji.com/zhuanqu_anjia/all/
    http://bj.ganji.com/zhuanqu_jiaren/all/
    http://bj.ganji.com/zhuanqu_shenghuo/all/
'''
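
The main program later consumes this string with channel_list.split(); a quick sanity check along the same lines (a minimal sketch of my own, assuming Python 3.7+ so dict.fromkeys keeps insertion order) could be:

# Split the triple-quoted string on whitespace and drop any duplicate channels
channels = list(dict.fromkeys(channel_list.split()))
print(len(channels))      # number of channel URLs to crawl
print(channels[0])        # e.g. http://bj.ganji.com/chuangdian/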
  • Ganji crawler data assignment
import time
import pymongo
import requests
from bs4 import BeautifulSoup
import random

client = pymongo.MongoClient('localhost',27017)
ganjiershou = client['ganjiershou']
ershou_list = ganjiershou['ershou_list']
ershou_info = ganjiershou['ershou_info']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8142.400'
}
proxy_list = [
    'http://218.202.111.10:80',
    'http://183.60.6.172:8080',
    'http://120.202.249.197:80',
]
proxy_ip = random.choice(proxy_list)  # pick a proxy IP at random
proxies = {'http': proxy_ip}

def get_links_from(channel, who_sells='o', pages=1):
    # Example list page: http://bj.ganji.com/pingbandiannao/o3/
    list_view = '{}{}{}/'.format(channel, who_sells, pages)  # build the list-page URL string with format()
    wb_data = requests.get(list_view, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('ul', 'pageLink'):
        links = soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a')
        for link in links:
            data = {
                'link': link.get('href')
            }
            print(data)
            ershou_list.insert_one(data)
        # Or written as:
        # for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'):
        #     item_link = link.get('href')
        #     ershou_list.insert_one({'url': item_link})
    else:
        # it is the last page!
        pass

get_links_from('http://bj.ganji.com/pingbandiannao/')

def get_item_info(url):
    wb_data = requests.get(url, headers=headers)
    if wb_data.status_code == 404:
        pass
    else:
        soup = BeautifulSoup(wb_data.text, 'lxml')
        title = soup.title.text  # .strip()
        price = soup.select('.f22')[0].text
        date = soup.select('.pr-5')[0].text.strip().split(' ')[0]
        area = list(map(lambda x: x.text, soup.select('ul.det-infor > li:nth-of-type(3) > a')))
        url = url
        print(title, price, date, area)
        ershou_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})

get_item_info('http://bj.ganji.com/shuma/2199547955x.htm')
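
One thing worth noting: proxy_ip and proxies are built near the top of this file but never passed to requests.get, so every request still goes out directly. If the proxy were actually meant to be used, the request inside get_links_from and get_item_info would presumably look like the sketch below (my assumption, not the original code):

# Hypothetical: actually route the request through the randomly chosen proxy
wb_data = requests.get(list_view, headers=headers, proxies=proxies, timeout=10)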
  • Main program
from multiprocessing import Pool
from 获得频道链接列表 import channel_list
from 赶集爬虫大数据作业 import get_links_from, get_item_info, ershou_list, ershou_info

def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, pages=i)  # walk list pages 1-99 of the channel

# Resume from a breakpoint: only crawl detail pages not yet stored in ershou_info
db_urls = [item['link'] for item in ershou_list.find()]
index_urls = [item['url'] for item in ershou_info.find()]
x = set(db_urls)
y = set(index_urls)
c = x - y
print(len(c))

if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=4)
    pool.map(get_all_links_from, channel_list.split())
    pool.map(get_item_info, c)
    pool.close()
    pool.join()
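
The set difference c = x - y is the resume-from-breakpoint trick: only links that are already in ershou_list but have no matching record in ershou_info get handed to get_item_info. A toy illustration (purely for my own notes, not part of the assignment):

# Toy example of the breakpoint-resume set difference
all_links = {'http://a', 'http://b', 'http://c'}   # links collected into ershou_list
crawled = {'http://a', 'http://b'}                 # detail pages already in ershou_info
print(all_links - crawled)                         # {'http://c'} -- only this one still needs crawling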

  • Monitoring the count
import time
from 赶集爬虫大数据作业 import ershou_list
while True:    
    print(ershou_list.find().count())    
    time.sleep(3)
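
On newer versions of pymongo, Cursor.count() has been removed; an equivalent monitor (a sketch assuming pymongo 3.7+, where Collection.count_documents exists) would be:

import time
from 赶集爬虫大数据作业 import ershou_list

# Same 3-second monitor, using count_documents instead of the removed cursor.count()
while True:
    print(ershou_list.count_documents({}))
    time.sleep(3)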

I've finally finished copying the assignment over. With this much work, it was basically another full review of the code!
Please comment, teacher!
