58同城二手数据获取

from bs4 import BeautifulSoup
import requests
import time
import random

# HTTP headers passed as `headers=headers` to every requests.get call in this file.
# NOTE(review): the Cookie value is a hard-coded browser session captured at
# some point in the past; it presumably expires and probably should not live
# in source control -- confirm and consider loading it from configuration.
headers={
    'Cookie': 'id58=c5/njVsBjTGDm7Q3B9NdAg==; 58tj_uuid=bde7a202-df83-41d2-905d-71ff35ce3765; als=0; commontopbar_myfeet_tooltip=end; xxzl_deviceid=%2FnmpP%2Bih70ZUIq01xI%2BZcGSwUUEKDP1wm4LRdzkqXc0mZqIC57yfV8zUF%2BLbIgrM; myfeet_tooltip=end; __utma=253535702.743072477.1526828404.1526828404.1526828404.1; __utmz=253535702.1526828404.1.1.utmcsr=cn.58.com|utmccn=(referral)|utmcmd=referral|utmcct=/diannao/pve_5621_101_200/; wmda_uuid=23e5d7502ce3d3f37f605e9a7fb0e759; wmda_new_uuid=1; gr_user_id=96cf90df-4b57-45c9-96ef-69036abfcb8b; jr8_t_c_v1=jinrong58com.15269750975030.4137874494841842; _ga=GA1.2.743072477.1526828404; cookieuid=e68ad006-bf75-4d2a-9c8f-65bb7dbad9f9; mcity=boluo; mcityName=%E5%8D%9A%E7%BD%97; wmda_visited_projects=%3B1409632296065%3B4166008487938%3B1731916484865; Hm_lvt_e2d6b2d0ec536275bb1e37b421085803=1526975473,1527666065; ppStore_fingerprint=A54230BD08C0AEB798B461CE0A6614961984F84733E824FF%EF%BC%BF1527666066511; final_history=33841631607604%2C34028871407046%2C18213731714561%2C32481337257540%2C34128334339022; city=gz; 58home=gz; new_uv=15; es_ab=1; sessionid=87a8c939-1de2-4c53-b3c1-e2c4b2662d25',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
# Module-level accumulator: get_channel_urls appends channel URLs here.
url_list=[]
# Module-level accumulator: get_links_from appends item detail URLs here.
item_links=[]

def get_channel_urls(url):
    """Collect channel URLs from the category index page at *url*.

    Fetches *url*, selects every ``span.dlb > a`` anchor, prefixes its
    ``href`` with the site host and appends the result to the module-level
    ``url_list``.

    Args:
        url: Address of the index page to scan.

    Returns:
        The module-level ``url_list`` (the same list object that was
        appended to, so it also contains entries from earlier calls).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url_host="http://gz.58.com"
    # A timeout keeps a stalled connection from hanging the scraper forever.
    wb_data=requests.get(url,headers=headers,timeout=10)
    soup=BeautifulSoup(wb_data.text,'lxml')
    for link in soup.select("span.dlb > a"):
        href=link.get("href")
        if not href:
            # Anchors without an href used to produce "http://gz.58.comNone".
            continue
        url_list.append(url_host+str(href))
    return url_list

def get_links_from(channel,page):
    """Collect item detail links from one listing page of a channel.

    Requests ``{channel}/0/pn{page}/``, sleeps two seconds, then appends the
    ``href`` (with any query string removed) of every ``td.t > a`` anchor to
    the module-level ``item_links``.

    Args:
        channel: Base URL of the channel.
        page: Page number to request.

    Returns:
        The module-level ``item_links`` list once the page has been
        processed, or ``None`` when the page contains no ``<td>`` element
        at all.
    """
    list_url="{}/0/pn{}/".format(channel,page)
    wb_data=requests.get(list_url,headers=headers)
    time.sleep(2)
    soup=BeautifulSoup(wb_data.text,'lxml')
    if not soup.find("td"):
        return None
    for link in soup.select("td.t > a"):
        href=link.get("href")
        if href:
            # Drop the query string so only the bare item URL is stored.
            item_links.append(href.split("?")[0])
    # Return only after the loop: returning inside it kept just the first link.
    return item_links

def _first_text(soup,selector):
    """Return the text of the first element matching *selector*, or None."""
    matches=soup.select(selector)
    return matches[0].text if matches else None


def get_item_info(url):
    """Fetch one item page and print its title, price and area.

    Nothing is printed when the response status is not 200. Any field whose
    selector matches nothing is reported as ``None`` instead of raising.

    Args:
        url: Address of the item detail page.

    Returns:
        None. The extracted ``dict`` is written to stdout with ``print``.
    """
    wb_data=requests.get(url,headers=headers)
    if wb_data.status_code!=200:
        # Without a parsed page there is nothing to extract; previously this
        # path fell through and failed with NameError on ``soup``.
        return
    soup=BeautifulSoup(wb_data.text,'lxml')
    data={
        "title":_first_text(soup,'div.box_left_top > h1'),
        "price":_first_text(soup,".price_now > i"),
        # "palce_li" is kept exactly as written -- presumably the site's own
        # class name; verify against the live markup before "correcting" it.
        "area":_first_text(soup,".palce_li > span > i"),
    }
    print(data)

if __name__ == '__main__':
    # Entry point: discover channels from the index page, walk pages 1-10 of
    # each channel, and print details for every item link containing `m`.
    start_url="http://gz.58.com/sale.shtml"
    m='zhuanzhuan'
    get_channel_urls(start_url)
    for url in url_list:
        for page in range(1,11):
            # item_links is a shared, ever-growing list. Remember its length
            # so only links added by this page are processed; iterating the
            # whole list re-fetched every earlier item after each page.
            already_seen=len(item_links)
            get_links_from(url,page)
            for item_link in item_links[already_seen:]:
                if m in item_link:
                    get_item_info(item_link)
                    time.sleep(1)

css路径表达式

p[attr] 选择带有attr属性的p元素(不论属性值是什么)
p[attr='value'] 选择attr属性值等于value的p元素(例如 a[target='_blank'] 选择target为_blank的a元素)
p[href^='substring'] 选择href属性值以substring开头的元素(例如 a[href^='https'] 选择所有href属性值以https开头的a元素)
p[href$='.pdf'] 选择所有href属性值以.pdf结尾的a元素
p[href*='w3schools'] 选择所有href属性值包含w3schools的a元素
p[title~="flower"] 选择title属性值(以空格分隔的单词列表)中包含单词flower的p元素
css=button.attr:contains("OK") :contains是个伪类(Pseudo-class),用冒号开头,括号里是要匹配的文本内容;注意它不是标准CSS选择器,只有jQuery/Sizzle等部分实现支持

 

你可能感兴趣的:(爬虫)