【爬虫篇】:爬取58同城商品信息

成果:

(图片占位:Paste_Image.png — 爬取结果的截图)

代码:
import requests
from bs4 import BeautifulSoup
import itertools
# Desktop Chrome User-Agent header so 58.com serves normal desktop markup
# instead of blocking or redirecting the default `requests` UA string.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'}

# 0 是个人 (personal seller), 1 是商家 (merchant) — used as the `choose` argument below

def get_url(choose):
    """Collect the detail-page URLs from one 58.com phone listing page.

    Args:
        choose: 0 for personal-seller listings, 1 for merchant listings
                (interpolated into the listing-page path).

    Returns:
        list[str]: the href of every item link found on the listing page.
    """
    list_phoneurl = 'http://gz.58.com/shouji/{}/'.format(choose)
    phone = requests.get(list_phoneurl, headers=headers)
    soup = BeautifulSoup(phone.text, 'lxml')
    # Each listing row exposes its detail link as <td class="t"><a class="t">.
    return [link.get('href') for link in soup.select('tr td.t a.t')]

get_url(0)

# Sample detail-page URL used by the demo call below.
telephone_url = 'http://gz.58.com/shouji/25755367441717x.shtml?psid=173193533191818784206463731&entinfo=257t55367441717_0'

def get_pv(telephone_url):
    """Fetch the page-view count of one 58.com detail page.

    The on-page counter is filled in by JavaScript from a separate counter
    endpoint, keyed by the numeric info id embedded in the page URL, so we
    query that endpoint directly.

    Args:
        telephone_url: detail-page URL of the form
            'http://gz.58.com/shouji/<infoid>x.shtml?...'.

    Returns:
        str: the view count parsed from the counter response.
    """
    # The info id is the last path segment up to 'x.':
    # .../25755367441717x.shtml?... -> '25755367441717'
    number = telephone_url.split('/')[-1].split('x.')[0]
    # The counter endpoint checks the Referer header (a basic anti-scraping
    # measure), so send the detail-page URL along with the request.
    headser = {'Referer': telephone_url}
    post_url = 'http://jst1.58.com/counter?infoid={}'.format(number)
    js = requests.get(post_url, headers=headser)
    # The response body ends with '...=<count>'; keep only the number part.
    return js.text.split('=')[-1]

get_pv(telephone_url)

def get_imformation(choose=0):
    """Scrape every listing returned by get_url and print its details.

    Args:
        choose: 0 for personal-seller listings, 1 for merchant listings
                (forwarded to get_url).
    """
    for phone_url in get_url(choose):
        try:
            phone_each = requests.get(phone_url, headers=headers)
            toup = BeautifulSoup(phone_each.text, 'lxml')
            # Last breadcrumb link is the item's category (类目).
            leimu = toup.select('#header > div.breadCrumb.f12 > span > a')[-1].get_text()
            biaoti = toup.select('div.col_sub.mainTitle > h1')[0].get_text()
            release_time = toup.select('#index_show > ul.mtit_con_left.fl > li.time')[0].get_text()
            price = toup.select('span.price.c_f50')[0].get_text()
            # .stripped_strings removes surrounding whitespace/blank lines;
            # it yields an iterator, so materialise it before joining.
            place = list(toup.select('div.su_con > span.c_25d')[-1].stripped_strings)
            data = {
                '类目': leimu,
                '标题': biaoti,
                '价格': price,
                # chain(*place) flattens every fragment into one string.
                '地址': "".join(itertools.chain(*place)),
                '发布时间': release_time,
                # View count is JS-loaded, so fetch it from the counter
                # endpoint via get_pv instead of reading the page.
                '浏览量': get_pv(phone_url),
            }
            print(data)
        except Exception:
            # Best-effort scraping: skip pages whose layout differs (ads,
            # removed listings) or that fail to download, instead of
            # aborting the whole run. (Narrowed from a bare `except:` so
            # KeyboardInterrupt/SystemExit still propagate.)
            continue

get_imformation(0)

# NOTE(review): reference notes preserved from the original post. This bare
# triple-quoted string is a no-op statement at module level — it is kept
# byte-identical and acts purely as a comment block about URL filtering.
'''
过滤方法

for link in soup.select('td.t > a[href^="http://bj.58.com/"]'): # 利用正则表达式删选自己需要网页的url
urls.append(link.get('href'))
return urls

  1. for title in titles:
    if title.get('data-addtype') == None and title.get('onclick') == None:
    filters.append(title)
    '''

你可能感兴趣的:(【爬虫篇】:爬取58同城商品信息)