This is the second week's big assignment. I'm submitting it late because I was away on a business trip all weekend and only just got back. Thanks to the teacher for the guidance.
- Get the channel links
from bs4 import BeautifulSoup
import requests
start_url = 'http://bj.ganji.com/wu/#all_category'
url_host = 'http://bj.ganji.com'
def get_channel_urls(start_url):
    wb_data = requests.get(start_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('.sider dd a')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_channel_urls(start_url)
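get_channel_urls above only prints each channel URL; the channel_list string below was then pasted together by hand from that output. As a side note, a small variant (just a sketch, not part of the submitted assignment) could return the links as a list so they can be used directly:

from bs4 import BeautifulSoup
import requests

def collect_channel_urls(start_url, url_host='http://bj.ganji.com'):
    # same selector as above, but collect the full URLs instead of printing them
    wb_data = requests.get(start_url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    return [url_host + link.get('href') for link in soup.select('.sider dd a')]

# channel_urls = collect_channel_urls('http://bj.ganji.com/wu/#all_category')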
channel_list ='''
http://bj.ganji.com/chuangdian/
http://bj.ganji.com/guizi/
http://bj.ganji.com/zhuoyi/
http://bj.ganji.com/shafachaji/
http://bj.ganji.com/zixingchemaimai/
http://bj.ganji.com/diandongche/
http://bj.ganji.com/motuoche/
http://bj.ganji.com/iphone/
http://bj.ganji.com/nokia/
http://bj.ganji.com/htc/
http://bj.ganji.com/sanxingshouji/
http://bj.ganji.com/motorola/
http://bj.ganji.com/shouji/_%E5%B0%8F%E7%B1%B3/
http://bj.ganji.com/shouji/_%E9%AD%85%E6%97%8F/
http://bj.ganji.com/tongxuntaocan/
http://bj.ganji.com/qqhao/
http://bj.ganji.com/bangongjiaju/
http://bj.ganji.com/jiguangyitiji/
http://bj.ganji.com/dayinji/z1/
http://bj.ganji.com/shipinjiagongshebei/
http://bj.ganji.com/shengchanjiamengshebei/
http://bj.ganji.com/jichuang/
http://bj.ganji.com/tuolaji/
http://bj.ganji.com/dianshi/
http://bj.ganji.com/bingxiang/
http://bj.ganji.com/kongtiao/
http://bj.ganji.com/reshuiqi/
http://bj.ganji.com/xiyiji/
http://bj.ganji.com/diancilu/
http://bj.ganji.com/weibolu/
http://bj.ganji.com/yueqiyinxiang/
http://bj.ganji.com/pingbandiannao/z1/
http://bj.ganji.com/ershoubijibendiannao/z1/_%E8%8B%B9%E6%9E%9C/
http://bj.ganji.com/ershoubijibendiannao/z1/_%E8%81%94%E6%83%B3/
http://bj.ganji.com/ershoubijibendiannao/z1/_Thinkpad/
http://bj.ganji.com/ershoubijibendiannao/z1/_%E7%B4%A2%E5%B0%BC/
http://bj.ganji.com/ershoubijibendiannao/z1/_%E6%88%B4%E5%B0%94/
http://bj.ganji.com/ershoubijibendiannao/z1/_%E5%8D%8E%E7%A1%95/
http://bj.ganji.com/ershoubijibendiannao/z1/_%E6%83%A0%E6%99%AE/
http://bj.ganji.com/yueqi/
http://bj.ganji.com/yinxiang/
http://bj.ganji.com/yundongqicai/
http://bj.ganji.com/tongche/
http://bj.ganji.com/tongzhuang/
http://bj.ganji.com/yingerche/
http://bj.ganji.com/yingerchuang/z1/
http://bj.ganji.com/niaobushi/
http://bj.ganji.com/wanju/
http://bj.ganji.com/naifen/
http://bj.ganji.com/taishidiannaozhengji/
http://bj.ganji.com/xianka/
http://bj.ganji.com/cpu/
http://bj.ganji.com/yingpan/
http://bj.ganji.com/luyouqi/
http://bj.ganji.com/3gwangka/
http://bj.ganji.com/shoucangpin/
http://bj.ganji.com/qitalipinzhuanrang/
http://bj.ganji.com/baojianpin/
http://bj.ganji.com/xiaofeika/
http://bj.ganji.com/fushi/
http://bj.ganji.com/xiangbao/
http://bj.ganji.com/xuemao/
http://bj.ganji.com/shoubiao/
http://bj.ganji.com/shipin/
http://bj.ganji.com/huazhuangpin/
http://bj.ganji.com/hufupin/
http://bj.ganji.com/shumaxiangji/
http://bj.ganji.com/shumashexiangji/
http://bj.ganji.com/youxiji/
http://bj.ganji.com/suishenting/
http://bj.ganji.com/yidongcunchu/
http://bj.ganji.com/zibubaojian/z2/
http://bj.ganji.com/anmobaojian/z1/
http://bj.ganji.com/bawanwujian/
http://bj.ganji.com/zhuanqu_anjia/all/
http://bj.ganji.com/zhuanqu_jiaren/all/
http://bj.ganji.com/zhuanqu_shenghuo/all/
'''
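channel_list is kept as one triple-quoted string; the main program below calls channel_list.split(), which splits on any whitespace (including newlines) and yields a plain list of channel URLs for pool.map. A minimal example with just two of the channels:

demo = '''
http://bj.ganji.com/chuangdian/
http://bj.ganji.com/guizi/
'''
channels = demo.split()
print(channels)       # ['http://bj.ganji.com/chuangdian/', 'http://bj.ganji.com/guizi/']
print(len(channels))  # 2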
- Ganji second-hand goods crawler assignment
import time
import pymongo
import requests
from bs4 import BeautifulSoup
import random
client = pymongo.MongoClient('localhost',27017)
ganjiershou = client['ganjiershou']
ershou_list = ganjiershou['ershou_list']
ershou_info = ganjiershou['ershou_info']
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8142.400'
}
proxy_list = [
'http://218.202.111.10:80',
'http://183.60.6.172:8080',
'http://120.202.249.197:80',
]
proxy_ip = random.choice(proxy_list)  # pick a proxy IP at random
proxies = {'http': proxy_ip}
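One thing worth noting: proxies is built here but never passed to requests.get further down, so every request still goes out from the local IP. A minimal sketch of how the proxy could actually be used (assuming the listed proxy IPs are still alive; the 10-second timeout is my own choice):

def get_with_proxy(url, headers=None):
    # pick a fresh proxy per request; fall back to a direct request if the proxy is dead
    proxies = {'http': random.choice(proxy_list)}
    try:
        return requests.get(url, headers=headers, proxies=proxies, timeout=10)
    except requests.exceptions.RequestException:
        return requests.get(url, headers=headers, timeout=10)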
def get_links_from(channel, pages=1, who_sells='o'):
    # e.g. http://bj.ganji.com/pingbandiannao/o3/
    list_view = '{}{}{}/'.format(channel, who_sells, pages)  # build the list-page URL with str.format
    wb_data = requests.get(list_view, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('ul', 'pageLink'):
        links = soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a')
        for link in links:
            data = {
                'link': link.get('href')
            }
            print(data)
            ershou_list.insert_one(data)
        # or, more compactly (keep the key name 'link' so the main program can read it back):
        # if soup.find('ul', 'pageLink'):
        #     for link in soup.select('#wrapper > div.leftBox > div.layoutlist > dl > dd.feature > div > ul > li > a'):
        #         item_link = link.get('href')
        #         ershou_list.insert_one({'link': item_link})
    else:
        # this is the last page
        pass

get_links_from('http://bj.ganji.com/pingbandiannao/')
def get_item_info(url):
    wb_data = requests.get(url, headers=headers)
    if wb_data.status_code == 404:
        pass
    else:
        soup = BeautifulSoup(wb_data.text, 'lxml')
        title = soup.title.text  # .strip()
        price = soup.select('.f22')[0].text
        date = soup.select('.pr-5')[0].text.strip().split(' ')[0]
        area = list(map(lambda x: x.text, soup.select('ul.det-infor > li:nth-of-type(3) > a')))
        print(title, price, date, area)
        ershou_info.insert_one({'title': title, 'price': price, 'date': date, 'area': area, 'url': url})

get_item_info('http://bj.ganji.com/shuma/2199547955x.htm')
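time is imported at the top of this file but never used. A common use for it in this kind of crawler (just a sketch; the 1 to 3 second range is an assumption, not something the course specified) is a randomized pause before each request so the site is hit less aggressively:

def polite_get(url, headers=None):
    # sleep 1 to 3 seconds before each request to reduce the chance of being blocked
    time.sleep(random.uniform(1, 3))
    return requests.get(url, headers=headers)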
- Main program
from multiprocessing import Pool
from 获得频道链接列表 import channel_list
from 赶集爬虫大数据作业 import get_links_from,get_item_info,ershou_list,ershou_info
def get_all_links_from(channel):
    for i in range(1, 100):
        get_links_from(channel, i)

# links collected so far minus detail pages already saved
db_urls = [item['link'] for item in ershou_list.find()]
index_urls = [item['url'] for item in ershou_info.find()]
x = set(db_urls)
y = set(index_urls)
c = x - y
print(len(c))

if __name__ == '__main__':
    pool = Pool()
    # pool = Pool(processes=4)
    pool.map(get_all_links_from, channel_list.split())
    pool.map(get_item_info, c)
    pool.close()
    pool.join()
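db_urls, index_urls and c are computed at import time, that is, before the first pool.map has collected any list-page links, so c only covers whatever was already in MongoDB from earlier runs. A sketch of the same logic reordered so the set difference is taken after the link-collection stage finishes (no new logic, just moving the existing lines into the __main__ block):

if __name__ == '__main__':
    pool = Pool()
    # stage 1: collect item links for every channel
    pool.map(get_all_links_from, channel_list.split())
    # stage 2: recompute which links still lack a detail record, then scrape them
    db_urls = [item['link'] for item in ershou_list.find()]
    index_urls = [item['url'] for item in ershou_info.find()]
    rest_of_urls = set(db_urls) - set(index_urls)
    pool.map(get_item_info, rest_of_urls)
    pool.close()
    pool.join()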
- Monitor the record count
import time
from 赶集爬虫大数据作业 import ershou_list
while True:
    print(ershou_list.find().count())
    time.sleep(3)
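One caveat on the monitor: in newer versions of pymongo, cursor.count() is deprecated; count_documents({}) on the collection is the replacement (a sketch assuming pymongo 3.7 or later):

import time
from 赶集爬虫大数据作业 import ershou_list

while True:
    print(ershou_list.count_documents({}))  # replaces the deprecated find().count()
    time.sleep(3)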
Finally finished pasting the whole assignment. It was a lot of work, but it doubled as a full review of the code!
Please review it, teacher!