Section 2 Practice Project: Scraping Product Information
from bs4 import BeautifulSoup
import re  # regular expressions, used to parse the review count

# path to the local static web page
path = './index.html'
with open(path, 'r') as wb_file:
    wb_content = wb_file.read()

soup = BeautifulSoup(wb_content, 'lxml')
pics = soup.select('body > div > div > div.col-md-9 > div > div > div > img')
titles = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
prices = soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
stars = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')
reviews = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right')

for pic, title, price, star, review in zip(pics, titles, prices, stars, reviews):
    data = {
        'pic': pic.get('src'),
        'title': title.get_text(),
        'price': price.get_text(),
        # the star rating is the number of span tags carrying the glyphicon-star class
        'star': len(star.find_all('span', 'glyphicon-star')),
        # pull the leading number out of the review text with a regular expression
        'review': int(re.search(r'(\d+)\s*.*', review.get_text()).group(1)),
    }
    print(data)
- Learned how to get the length of a list (used here to count the star icons)
- Picked up some basic regular expression knowledge
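As a quick illustration of both points, a tiny standalone snippet (the review text and star list below are made up for the example, not taken from the page):

import re

star_tags = ['span', 'span', 'span', 'span']            # stand-in for the matched star icon tags
review_text = '18 reviews'                               # stand-in for review.get_text()
print(len(star_tags))                                    # 4  -> list length gives the star rating
print(int(re.search(r'(\d+)', review_text).group(1)))    # 18 -> first run of digits in the text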
Section 3 Practice Project: Scraping Rental Listings
from bs4 import BeautifulSoup
import requests
import time
import re

def get_room_info(soup, data=None):
    titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    addresses = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
    prices = soup.select('#pricePart > div.day_l > span')
    imgs = soup.select('#curBigImage')
    fangdongs = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    genders = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    names = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    info = []
    for title, address, price, img, fangdong, gender, name in zip(titles, addresses, prices, imgs, fangdongs, genders, names):
        data = {
            'title': title.get_text(),
            'address': address.get_text().strip(),
            'price': price.get_text(),
            'img': img.get('src'),
            'fangdong': fangdong.get('src'),   # host (fangdong) avatar
            'gender': gender.get('class'),
            'name': name.get_text(),
        }
        # the 'member_ico' class marks a male host; anything else is treated as female
        data['gender'] = '男' if 'member_ico' in data['gender'] else '女'
        info.append(data)
    return info[0] if info else None
# can this URL be used as a seed page?
def is_seed(url):
    if url is None:
        return False
    for reg in [r'^http://[a-zA-Z0-9]+\.xiaozhu\.com/fangzi/\d+\.html',
                r'^http://[a-zA-Z0-9]+\.xiaozhu\.com/?$']:
        if re.search(reg, url) is not None:
            return True
    return False

# is this URL a rental listing (fangzi) page?
def is_fangzi(url):
    return url is not None and re.search(r'^http://.*\.xiaozhu\.com/fangzi/\d+\.html', url) is not None

# upper limit on the number of listings to scrape
max_count = 300
# counter of listings scraped so far
count = 0
# pages queued for the current crawl round
seeds = ['http://www.xiaozhu.com/']
# seed pages already crawled, to avoid crawling them twice
completed_seeds = []
# listing pages already crawled, to avoid crawling them twice
completed_urls = []
while count < max_count and len(seeds) > 0:  # stop when the limit is reached or nothing is left to crawl
    urls = []
    print('{} -> {}'.format(len(seeds), seeds))
    for url in seeds:
        completed_seeds.append(url)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # collect candidate seed links from the current page
        for a in soup.find_all('a'):
            link = a.get('href')
            if is_seed(link) and link not in urls:
                urls.append(link)
        # if the current page itself is a listing page, parse it
        if is_fangzi(url) and url not in completed_urls:
            completed_urls.append(url)
            count += 1
            print('parse #{} -> {}'.format(count, url))
            data = get_room_info(soup)
            print('data -> {}'.format(data))
            if count >= max_count:
                break
        time.sleep(2)
    # next round: only keep links that have not been crawled yet
    seeds.clear()
    for url in urls:
        if url not in completed_seeds:
            seeds.append(url)
- Choose the seed-page conditions carefully; not every page contains links to listing pages.
- Record pages that have already been crawled so they are not fetched again.
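The second note is essentially the classic "frontier + visited set" crawling pattern. A minimal, self-contained sketch of just the bookkeeping (fetch_links below is a placeholder name, not part of the project code):

def crawl(start_url, limit=300):
    frontier = [start_url]      # pages waiting to be fetched
    visited = set()             # pages already fetched, so nothing is crawled twice
    processed = 0
    while frontier and processed < limit:
        url = frontier.pop(0)
        if url in visited:
            continue
        visited.add(url)
        processed += 1
        # fetch_links(url) would download the page and return candidate links:
        # frontier.extend(link for link in fetch_links(url) if link not in visited)
    return processed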
Section 4 Practice Project: Scraping Taylor Swift Pictures
Not sure why, but the images on the page just won't load.
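One thing that might be worth checking (not verified against this particular site): many image hosts refuse requests that lack a browser-like User-Agent or a Referer header, so fetching the image URL directly with those headers set sometimes helps. A rough sketch with a made-up image URL:

import requests

img_url = 'http://example.com/taylor.jpg'   # hypothetical image URL scraped from the page
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36',
    'Referer': 'http://example.com/',       # some hosts check this to block hotlinking
}
resp = requests.get(img_url, headers=headers)
if resp.status_code == 200:
    with open('taylor.jpg', 'wb') as f:
        f.write(resp.content)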
Week 1 Assignment: Scraping One Page of Product Data
from bs4 import BeautifulSoup
import requests
import time
import re

def get_product_info(url, data=None):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    cates = soup.select('#header > div.breadCrumb.f12 > span > a')
    titles = soup.select('div.col_sub.mainTitle > h1')
    times = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
    prices = soup.select('div.col_sub.sumary > ul > li:nth-of-type(1) > div.su_con > span')
    finenesses = soup.select('div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
    areas = soup.select('div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span')
    data = {
        'cate': cates[-1].get_text() if len(cates) > 0 else None,
        'title': titles[0].get_text() if len(titles) > 0 else None,
        'time': times[0].get_text() if len(times) > 0 else None,
        'price': int(prices[0].get_text()) if len(prices) > 0 else None,
        'fineness': finenesses[0].get_text().strip() if len(finenesses) > 0 else None,
        'area': ''.join(areas[0].stripped_strings) if len(areas) > 0 else None,
        # the view count comes from a separate counter API
        'count': get_views_from(url),
    }
    return data
# this function is adapted from the instructor's code
def get_views_from(url):
    # the URL carries query parameters; keep only the part before the '?'
    url_path = url.split('?')[0]
    # the last path segment contains the listing id
    url_last_part = url_path.split('/')[-1]
    # drop the trailing 'x.shtml' to get the id
    info_id = url_last_part.split('x.shtml')[0]
    # 58.com exposes a counter API for the view count (similar in spirit to the Sina Weibo APIs)
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    # the counter endpoint has anti-scraping checks: without these headers it returns an empty result
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        'Cookie': r'id58=05dvOVXZXj5KlGIZUAzjAg==; als=0; tj_ershounobiz=true; bj58_id58s="MUpHU0toSm1LdW5RODkyMA=="; sessionid=8876e87f-a6d3-475a-918e-e82609136139; 58home=nj; ipcity=nj%7C%u5357%u4EAC; myfeet_tooltip=end; __utma=253535702.1740136137.1464274125.1464274125.1464274125.1; __utmb=253535702.2.10.1464274125; __utmc=253535702; __utmz=253535702.1464274125.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); 58tj_uuid=2e47753f-8bb2-4d14-abbd-00a483f337da; new_session=0; new_uv=2; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(str(info_id)),
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'jst1.58.com',
        'Referer': url
    }
    r = requests.get(api, headers=headers)
    # check the status code to detect whether the IP has been blocked
    if r.status_code == 200:
        return r.text.split('=')[-1]
    return 0
seed = 'http://bj.58.com/pbdn/0/'
wb_data = requests.get(seed)
soup = BeautifulSoup(wb_data.text, 'lxml')
urls = []
# first parse the list page and collect product links, skipping entries that carry onclick/data-addtype attributes
for tag in soup.select('table > tr > td > a.t'):
    if not tag.has_attr('onclick') and not tag.has_attr('data-addtype'):
        urls.append(tag.get('href'))
# then visit each product page and parse its details
for url in urls:
    print('{} -> {}'.format(url, get_product_info(url)))
    time.sleep(2)
- Learned how to scrape view counts; it broadened my horizons. I should observe pages like this more carefully in general.
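For reference, this is what the id extraction in get_views_from does to a listing URL (the URL and id below are invented for illustration, following the '...x.shtml' pattern the code assumes):

url = 'http://bj.58.com/pbdn/25542983992683x.shtml?psid=123'     # hypothetical listing URL
info_id = url.split('?')[0].split('/')[-1].split('x.shtml')[0]
print(info_id)                                                   # -> '25542983992683'
print('http://jst1.58.com/counter?infoid={}'.format(info_id))    # counter API used above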