# Python 爬取链家租房信息

import json
import time

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

# Entry page for Beijing rentals, and the browser-like headers sent with every request.
home_url = 'https://bj.lianjia.com/zufang'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}

# Home page. The timeout keeps a stalled connection from hanging the script forever.
home_rt = rq.get(home_url, headers=headers, timeout=10).text
home_soup = BeautifulSoup(home_rt, 'lxml')

# Collect the entry link of every district from the home page filter bar.
district_url_rt = home_soup.find_all('li', attrs={'class': 'filter__item--level2', 'data-type': 'district'})

district_urls = []
# The first filter item is skipped, as in the original index loop that started
# at 1 -- presumably it is the "no restriction" option; TODO confirm on the page.
for district_item in district_url_rt[1:]:
    district_name = district_item.a.string  # district name
    dis_url = 'https://bj.lianjia.com' + district_item.a.attrs['href']  # absolute district link
    district_urls.append([district_name, dis_url])

print(district_urls)
print('区域接口获取完毕')

# One row per listing, accumulated across all districts. The field order matches
# the row assembled in the district loop at the bottom of this block.
finally_house_result = []


def _fetch_soup(url):
    """Download ``url`` with the shared headers and return it parsed with lxml.

    The parser is named explicitly so every page is parsed the same way instead
    of letting BeautifulSoup choose whichever parser happens to be installed.
    The timeout keeps a single stalled connection from hanging the whole crawl.
    """
    return BeautifulSoup(rq.get(url, headers=headers, timeout=10).text, 'lxml')


def _collect_listings(district_url, page_num):
    """Return ``[title, address, url]`` for every listing on pages 1..page_num."""
    listings = []
    for page in range(1, page_num + 1):
        time.sleep(0.8)  # throttle between result pages
        page_soup = _fetch_soup(district_url + f'/pg{page}')  # current result page
        # Every listing card on the current page.
        for houselist_rt in page_soup.find_all('div', attrs={'class': 'content__list--item'}):
            house_url = 'https://bj.lianjia.com' + houselist_rt.a['href']  # detail page url
            house_title = houselist_rt.a.img['alt']  # title
            address_list = houselist_rt.div.find('p', attrs={'class': 'content__list--item--des'}).find_all('a')
            address = address_list[1].string + '.' + address_list[2].string  # address
            listings.append([house_title, address, house_url])
    return listings


def _parse_house(house_soup):
    """Extract the detail fields of one listing page.

    Returns a list in result-row order: size, orientation, floor, elevator, gas,
    facilities (JSON), lease term, move-in time, rent, deposit, service fee,
    agency fee, agents (JSON).

    Raises IndexError/AttributeError/TypeError when the page does not have the
    expected layout, because the fields are read by fixed position.
    """
    # Fee table cells. Each fee is a value cell (6-9) concatenated with the
    # <span> text of the matching cell in 1-4.
    # NOTE(review): cells 1-4 look like the header row carrying the unit -- confirm.
    fee_cells = house_soup.find_all('li', attrs={'class': 'table_col'})
    rent = fee_cells[6].string + fee_cells[1].find('span').string  # rent
    deposit = fee_cells[7].string + fee_cells[2].find('span').string  # deposit
    service_fee = fee_cells[8].string + fee_cells[3].find('span').string  # service fee
    agency_fee = fee_cells[9].string + fee_cells[4].find('span').string  # agency fee

    # Basic-info items. [3:] drops the first three characters of each item
    # (presumably the "label:" prefix -- TODO confirm against a live page).
    info_items = house_soup.find_all('li', attrs={'class': 'fl oneline'})
    size = info_items[1].string[3:]  # area
    toward = info_items[2].string[3:]  # orientation
    in_time = info_items[5].string[3:]  # move-in time
    rent_term = info_items[7].string[3:]  # lease term
    storey = info_items[10].string[3:]  # floor
    elevator = info_items[11].string[3:]  # elevator
    gas = info_items[17].string[3:]  # gas

    # Facilities: items from index 21 onward. The list found above is reused
    # rather than re-querying the whole document once per item.
    supporting_facilities = json.dumps(
        [item.text.strip() for item in info_items[21:]],
        ensure_ascii=False,
    )

    # Agent info: names, phones and ratings are matched up by position.
    agency_names = house_soup.find_all('a', attrs={'class': 'name'})
    agency_phones = house_soup.find_all('div', attrs={'class': 'phone'})
    agency_scores = house_soup.find_all('div', attrs={'class': 'rate'})
    agency_list = json.dumps(
        [
            {'中介姓名': name.string, '电话': phone.string, '评分': score.text.strip()}
            for name, phone, score in zip(agency_names, agency_phones, agency_scores)
        ],
        ensure_ascii=False,
    )

    return [size, toward, storey, elevator, gas, supporting_facilities, rent_term,
            in_time, rent, deposit, service_fee, agency_fee, agency_list]


# Walk every district entry link and scrape all of its listings.
for dis_url in district_urls:
    time.sleep(5)  # pause between districts
    district_name = dis_url[0] + '区'
    district_url = dis_url[1]
    district_soup = _fetch_soup(district_url)
    # Number of result pages for this district, read from the pager element.
    page_num = int(district_soup.find('div', attrs={'class': 'content__pg'}).attrs['data-totalpage'])

    # Pass 1: title + address + url of every listing on every page.
    house_titurl = _collect_listings(district_url, page_num)
    district_num = len(house_titurl)
    print(f'{district_name}房屋标题&url获取完毕,共{district_num}套租房信息')

    # Pass 2: visit each listing's detail page and build its result row.
    for house_title, address, house_url in house_titurl:
        time.sleep(0.6)  # throttle between detail pages
        details = _parse_house(_fetch_soup(house_url))
        finally_house_result.append([district_name, address, house_title, *details])
    print(f'{district_name}房屋信息获取完毕,共{district_num}套')

# Total number of listings scraped across all districts.
data_num = len(finally_house_result)

# Column headers, in the same order as the fields of each result row.
columns = ['区域', '地址', '标题', '面积', '朝向', '楼层', '电梯', '燃气', '配套设施', '租期', '入住时间', '房租', '押金', '服务费', '中介费', '中介联系方式']

# Build the table and write it to an Excel workbook at a fixed path.
house_finally_dfdata = pd.DataFrame(finally_house_result, columns=columns)
house_finally_dfdata.to_excel('d:\\Desktop\\20191124链家北京各城区租房信息.xlsx')
print(f'北京市各城区租房信息获取完毕,共{data_num}套')

# 你可能感兴趣的:(python爬取链家租房信息)