Python实战计划学习笔记0702

实战计划第五天,抓了58同城。

最终成果是这样的:

Python实战计划学习笔记0702_第1张图片
Paste_Image.png

我的代码:

#!/usr/bin/env python    #告诉计算机执行程序在系统环境变量中的名字,详细位置在环境变量中设置好了
#-*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import time
import requests

def get_info(link):
    """Download one listing detail page and print the fields scraped from it.

    The page at ``link`` is parsed with BeautifulSoup's ``lxml`` parser.  Six
    CSS selectors collect the category, title, date, price, condition and area
    nodes; the node lists are zipped together and one dict is printed per
    zipped group.  The ``view`` entry comes from ``get_view(link)``.

    Args:
        link: URL of the listing detail page.

    Returns:
        None.  Each record is written to stdout with ``print``.
    """
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'lxml')

    # Using nth-of-type(3) instead of nth-child(3) makes the selectors match
    # only this page's own information.  The condition and area nodes contain
    # extra whitespace/symbols, hence ``stripped_strings`` below.
    category_nodes = soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
    title_nodes = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1')
    date_nodes = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
    price_nodes = soup.select(
        '#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(1) > div.su_con > span')
    condition_nodes = soup.select(
        '#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
    area_nodes = soup.select(
        '#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span')

    grouped = zip(category_nodes, title_nodes, date_nodes,
                  price_nodes, condition_nodes, area_nodes)
    for category, title, date, price, condition, _area in grouped:
        record = {}
        record['type'] = category.get_text()
        record['title'] = title.get_text()
        # NOTE(review): the key is spelled 'data' although it holds the date
        # text; kept as-is because it is part of the printed output.
        record['data'] = date.get_text()
        record['price'] = price.get_text()
        # list() materialises the stripped_strings generator.
        record['conditions'] = list(condition.stripped_strings)
        # The area is reported only when the page contains a span with
        # class "c_25d"; the first area node is always the one used.
        if soup.find_all('span', 'c_25d'):
            record['area'] = list(area_nodes[0].stripped_strings)
        else:
            record['area'] = None
        record['view'] = get_view(link)
        print(record)

def get_view(url):
    """Return the view count reported by the counter endpoint for a listing.

    The listing id is the last path segment of ``url`` with any query string
    dropped and a trailing ``x.shtml`` removed.  It is sent to
    ``http://jst1.58.com/counter`` and the text after the last ``=`` in the
    response body is returned.

    Args:
        url: Detail page URL; presumably of the form
            ``http://bj.58.com/pingbandiannao/<id>x.shtml`` (matches the
            Referer built below).

    Returns:
        str: Whatever follows the last ``=`` in the counter response text.
    """
    page_name = url.split('?')[0].split('/')[-1]
    # str.strip('x.shtml') treats its argument as a *set of characters* and
    # trims them from both ends, so it only worked by accident for all-digit
    # ids.  Remove the literal suffix instead.
    suffix = 'x.shtml'
    if page_name.endswith(suffix):
        infoid = page_name[:-len(suffix)]
    else:
        infoid = page_name
    api = 'http://jst1.58.com/counter?infoid={}'.format(infoid)

    # Header information has to be sent along with the counter request.
    headers = {'User-Agent':r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
               'Cookie':r'id58=c5/ns1ct99sKkWWeFSQCAg==; city=bj; 58home=bj; ipcity=yiwu%7C%u4E49%u4E4C%7C0; als=0; myfeet_tooltip=end; bj58_id58s="NTZBZ1Mrd3JmSDdENzQ4NA=="; sessionid=021b1d13-b32e-407d-a76f-924ec040579e; bangbigtip2=1; 58tj_uuid=0ed4f4ba-f709-4c42-8972-77708fcfc553; new_session=0; new_uv=1; utm_source=; spm=; init_refer=; final_history={}; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1'.format(str(infoid)),
               'Accept': '*/*',
               'Accept-Encoding': 'gzip, deflate, sdch',
               'Accept-Language': 'zh-CN,zh;q=0.8',
               'Cache-Control': 'max-age=0',
               'Connection': 'keep-alive',
               'Host':'jst1.58.com',
               'Referer':r'http://bj.58.com/pingbandiannao/{}x.shtml'.format(str(infoid))
               }
    js = requests.get(api, headers=headers)
    view = js.text.split('=')[-1]
    return view

def get_links_info(page):
    """Crawl listing index pages and print details for every matching link.

    Index pages ``http://bj.58.com/pbdn/1/pn<i>`` are fetched for ``i`` in
    ``range(1, page)``, so ``page`` itself is excluded.  For each page, the
    anchors found under ``#infolist`` are printed, the crawler sleeps two
    seconds, and ``get_info`` is called for every anchor whose ``href`` starts
    with ``http://bj.58.com/``.

    Args:
        page: Exclusive upper bound of the index page numbers to crawl.

    Returns:
        None.
    """
    urls = ['http://bj.58.com/pbdn/1/pn{}'.format(str(i)) for i in range(1, page)]
    for url in urls:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Space-separated selectors are descendant combinators: they match at
        # any nesting depth, so no exact parent > child chain is needed.
        schemes = soup.select('#infolist tr td.t a')
        print(schemes)
        # Pause between index pages to avoid hammering the site.
        time.sleep(2)

        for scheme in schemes:
            link = scheme.get('href')
            # Tag.get returns None for an anchor without href; slicing None
            # used to raise TypeError.  Only same-site listing links are
            # followed.
            if link and link.startswith('http://bj.58.com/'):
                get_info(link)



# Script entry: crawl the listing index.  Page numbers come from range(1, 20),
# i.e. pn1 through pn19.
get_links_info(20)

总结和问题

  • list()用法
  • CSS 选择器路径可以不写 >:用空格分隔表示后代选择器(如 '#infolist tr td.t a'),不要求逐级父子关系
  • 字典写文件语句
  • open() 的文件路径字符串前加 r(原始字符串),避免路径中的反斜杠被当作转义字符
  • 确定浏览量代码

你可能感兴趣的:(Python实战计划学习笔记0702)