Python实战计划 (Practical Python Plan): Week 1 Study Notes

Section 2 practice project: scraping product information

from bs4 import BeautifulSoup
import re  # regular expressions, used to pull the review count out of text

# path to the local static web page
path = './index.html'

with open(path, 'r') as wb_file:
    wb_content = wb_file.read()
    soup = BeautifulSoup(wb_content, 'lxml')
    pics=soup.select('body > div > div > div.col-md-9 > div > div > div > img')
    titles=soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
    prices=soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
    stars=soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')
    reviews=soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right')

    for pic, title, price, star, review in zip(pics, titles, prices, stars, reviews):
        data = {
            'pic': pic.get('src'),
            'title': title.get_text(),
            'price': price.get_text(),
            # count the highlighted star icons to get the rating
            'star': len(star.find_all('span', 'glyphicon-star')),
            # use a regular expression to pull the leading number out of the review text
            'review': int(re.search(r'(\d+)\s*.*', review.get_text()).group(1))
        }
        print(data)
(Screenshot: output of the product-info script)
  • Learned how to get the length of a list
  • Picked up some basic regular-expression knowledge (a tiny example of both points follows)
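
A tiny self-contained example of both points; the list of tags and the review string below are made up, not taken from the page:

import re

star_tags = ['<span class="glyphicon-star">'] * 3   # stand-in for the matched star elements
print(len(star_tags))                                # len() counts the matched elements -> 3

review_text = '24 reviews'                           # made-up review text in the page's format
match = re.search(r'(\d+)\s*.*', review_text)        # capture the leading digits, same pattern as above
print(int(match.group(1)))                           # -> 24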

Section 3 practice project: scraping rental listings

from bs4 import BeautifulSoup
import requests
import time
import re

# parse one listing page (already loaded into a BeautifulSoup object) and return its fields
def get_room_info(soup):
    titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    addresses = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
    prices =soup.select('#pricePart > div.day_l > span')
    imgs = soup.select('#curBigImage')
    fangdongs =soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    genders = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    names =soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    info = []
    for title, address, price, img, fangdong, gender, name in zip(titles, addresses, prices, imgs, fangdongs, genders, names):
        data = {
            'title': title.get_text(),
            'address': address.get_text().strip(),
            'price': price.get_text(),
            'img': img.get('src'),
            'fangdong': fangdong.get('src'),  # landlord avatar image
            'gender': gender.get('class'),
            'name': name.get_text()
        }
        # the gender icon's CSS class is exactly 'member_ico' for male hosts; anything else is treated as female
        data['gender'] = '男' if 'member_ico' in data['gender'] else '女'
        info.append(data)
    # a listing page describes exactly one room, so return the first (and only) record
    return info[0] if info else None
# can this URL be used as a seed page (a listing page or the site home page)?
def is_seed(url):
    if url is None:
        return False
    for reg in [r'^http://[a-zA-Z0-9]+\.xiaozhu\.com/fangzi/\d+\.html',
                r'^http://[a-zA-Z0-9]+\.xiaozhu\.com/?$']:
        if re.search(reg, url) is not None:
            return True
    return False
# is this URL a rental listing (fangzi) detail page?
def is_fangzi(url):
    return url is not None and re.search(r'^http://.*\.xiaozhu\.com/fangzi/\d+\.html', url) is not None
# upper limit on the number of listings to scrape (renamed from `max` to avoid shadowing the built-in)
max_count = 300
# counter of listings scraped so far
count = 0
# URLs waiting to be crawled in the current iteration
seeds = ['http://www.xiaozhu.com/']
# seed pages already crawled, so they are not fetched twice
completed_seeds = []
# listing pages already parsed, to avoid duplicates
completed_urls = []
while count < max_count and len(seeds) > 0:  # stop once the limit is reached or there is nothing left to crawl
    urls = []
    print('{} -> {}'.format(len(seeds),seeds))
    for url in seeds:
        completed_seeds.append(url)
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for a in soup.find_all('a'):
            link = a.get('href')
            if is_seed(link) and link not in urls:
                urls.append(link)
        if is_fangzi(url) and url not in completed_urls:
            completed_urls.append(url)
            count += 1
            print('Parsing listing #{} -> {}'.format(count, url))
            data = get_room_info(soup)
            print('data -> {}'.format(data))
            if count >= max_count:
                break
        time.sleep(2)
    seeds.clear()
    for url in urls:
        if url not in completed_seeds:
            seeds.append(url)
(Screenshot: output of the rental-listing crawler)

-- Control the conditions for seed pages carefully; not every page contains links to listing pages.
-- Record the pages that have already been crawled so they are not fetched again (a minimal sketch of this visited-set idea follows).
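
A minimal sketch of that visited-set idea, separate from the exercise code; `fetch_links` here is only a placeholder for the requests + BeautifulSoup logic above:

from collections import deque

def fetch_links(url):
    # placeholder: in the real crawler this fetches the page and returns its <a href> values
    return []

visited = set()                               # pages already crawled; set lookups are O(1)
frontier = deque(['http://www.xiaozhu.com/'])

while frontier:
    url = frontier.popleft()
    if url in visited:                        # skip anything fetched before
        continue
    visited.add(url)
    for link in fetch_links(url):
        if link not in visited:
            frontier.append(link)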

Section 4 practice project: scraping Taylor Swift photos

For some reason, the images on the page never manage to load.

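I have not verified the cause, but a common one is lazy loading: the real image URL sits in an attribute such as data-original or data-src while src only points at a placeholder. A minimal sketch of checking for that (the markup below is made up):

from bs4 import BeautifulSoup

html = '<img src="placeholder.gif" data-original="http://example.com/real.jpg">'
soup = BeautifulSoup(html, 'lxml')
img = soup.find('img')
# prefer the lazy-load attribute when present, otherwise fall back to src
real_src = img.get('data-original') or img.get('src')
print(real_src)   # -> http://example.com/real.jpg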

Week 1 assignment: scraping one page of product listings

from bs4 import BeautifulSoup
import requests
import time
import re

# fetch one 58.com product detail page and parse its fields
def get_product_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    cates = soup.select('#header > div.breadCrumb.f12 > span > a')
    titles = soup.select('div.col_sub.mainTitle > h1')
    times = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
    prices = soup.select('div.col_sub.sumary > ul > li:nth-of-type(1) > div.su_con > span')
    finenesses = soup.select('div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
    areas = soup.select('div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span')
    data = {
        'cate': cates[-1].get_text() if len(cates) > 0 else None,
        'title': titles[0].get_text() if len(titles) > 0 else None,
        'time': times[0].get_text() if len(times) > 0 else None,
        'price': int(prices[0].get_text()) if len(prices) > 0 else None,
        'fineness': finenesses[0].get_text().strip() if len(finenesses) > 0 else None,
        'area': ''.join(areas[0].stripped_strings) if len(areas) > 0 else None,
        # the view count comes from a separate counter API, see get_views_from below
        'count': get_views_from(url)
    }
    return data
# this function is adapted from the instructor's code
def get_views_from(url):
    # the URL may carry a query string; keep only the part before the '?'
    url_path = url.split("?")[0]
    # the last path segment contains the listing id
    url_last_part = url_path.split('/')[-1]
    # drop the trailing 'x.shtml' to get the id (str.strip removes a character set, so use a regex instead)
    info_id = re.sub(r'x\.shtml$', '', url_last_part)
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    # this is 58.com's view-counter API; if APIs are new to you, the Sina Weibo API docs are a good introduction
    # the counter endpoint has anti-scraping measures, so send the headers below or the response comes back empty
    headers = {
        'User-Agent':r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        # 'Cookie':r'id58=c5/ns1ct99sKkWWeFSQCAg==; city=bj; 58home=bj; ipcity=yiwu%7C%u4E49%u4E4C%7C0; als=0; myfeet_tooltip=end; bj58_id58s="NTZBZ1Mrd3JmSDdENzQ4NA=="; sessionid=021b1d13-b32e-407d-a76f-924ec040579e; bangbigtip2=1; 58tj_uuid=0ed4f4ba-f709-4c42-8972-77708fcfc553; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(str(info_id)),
        'Cookie':r'id58=05dvOVXZXj5KlGIZUAzjAg==; als=0; tj_ershounobiz=true; bj58_id58s="MUpHU0toSm1LdW5RODkyMA=="; sessionid=8876e87f-a6d3-475a-918e-e82609136139; 58home=nj; ipcity=nj%7C%u5357%u4EAC; myfeet_tooltip=end; __utma=253535702.1740136137.1464274125.1464274125.1464274125.1; __utmb=253535702.2.10.1464274125; __utmc=253535702; __utmz=253535702.1464274125.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); 58tj_uuid=2e47753f-8bb2-4d14-abbd-00a483f337da; new_session=0; new_uv=2; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(str(info_id)),
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host':'jst1.58.com',
        'Referer':url
    }
    r = requests.get(api, headers=headers)
    # check the status code to see whether the site has blocked this IP
    if r.status_code == 200:
        return r.text.split('=')[-1]
    return 0

seed = 'http://bj.58.com/pbdn/0/'
wb_data = requests.get(seed)
soup = BeautifulSoup(wb_data.text, 'lxml')
urls = []
# first parse the listing (index) page
for tag in soup.select('table > tr > td > a.t'):
    if not tag.has_attr('onclick') and not tag.has_attr('data-addtype'):
        urls.append(tag.get('href'))
# walk through the collected detail-page URLs and parse each product
for url in urls:
    print('{} -> {}'.format(url,get_product_info(url)))
    time.sleep(2)
(Screenshot: output of the 58.com product scraper)
  • Learned how view counts can be scraped, which broadened my horizons; I should observe details like this more in everyday work. (A minimal sketch of calling the counter endpoint on its own follows.)
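
A minimal sketch of calling the counter endpoint directly. The info id and Referer below are made up, and the '...=N' response format is an assumption based on the split('=') parsing above:

import requests

info_id = '24568662687253'   # hypothetical listing id
api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'http://bj.58.com/pbdn/0/'   # hypothetical referer; the real code passes the detail-page URL
}
r = requests.get(api, headers=headers)
# the response body ends with '=<number>'; everything after the last '=' is the view count
views = r.text.split('=')[-1] if r.status_code == 200 else 0
print(views)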
