Scraping the listings published on Xiaozhu (小猪短租)

Given a region (domestic or overseas) and a city name, the site only serves 13 pages of listings per city. Muniao (木鸟短租) is worth trying to scrape the same way, mainly because there are write-ups covering it.

import re
import time

import requests
from lxml import etree

# City list reference:
# https://www.cnblogs.com/114811yayi/p/7061674.html

# Fetch the JS city index (Chinese name, pinyin slug, listing count per city)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
url = 'http://jci.xiaozhustatic1.com/e18122903/xzjs?k=Front_Index&httphost=www.xiaozhu.com'     # endpoint that returns the city list as JavaScript
ser = input('Choose a region (1: domestic; 2: overseas): ')            # search domestic or overseas short-term rentals
html = requests.get(url, headers=headers).text       # JS payload mapping every city to its pinyin slug

def choose_area():
	# Yield one {chinese_name: [pinyin, listing_count]} dict per city in the chosen region.
	# Overseas entries carry a time-offset-like token (e.g. '8:0'); that token's
	# presence is how domestic and overseas cities are told apart here.
	city_tup = re.compile(r'citys\[\d+\]=new Array\((.*?)\);').findall(html)
	for city_name in city_tup:
		is_overseas = bool(re.compile(r'\d+:\d+').findall(city_name))
		if (ser == '1' and not is_overseas) or (ser == '2' and is_overseas):
			city = re.compile('[\u4E00-\u9FA5]+').findall(city_name)[0]     # Chinese city name
			city_jc = re.compile(r'[a-z]\w*').findall(city_name)[1]      # pinyin slug (second lowercase token, per the original payload layout)
			city_zf = re.compile(r'\d+').findall(city_name)[0]      # number of listings in the city
			yield {city: [city_jc, city_zf]}
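
With ser set to '1', each item choose_area() yields is a one-entry dict of this shape (the listing count here is illustrative):

# {'上海': ['shanghai', '10000']}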

def get_url(city_jc, page):  # build a search-results URL from the city's pinyin slug and a page number
	url = 'http://{}.xiaozhu.com/search-duanzufang-p{}-0/'.format(city_jc, page)
	return url
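
For example, the pinyin slug becomes the subdomain and the page number lands in the path:

# >>> get_url('shanghai', 2)
# 'http://shanghai.xiaozhu.com/search-duanzufang-p2-0/'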

# Work out how many result pages the chosen city has. Listings are paginated at 24 per
# page and the site serves at most 13 pages per city, so cap the page count at 13;
# smaller cities get however many pages they actually have.
def url_list(city_name):
	for city in choose_area():
		if city_name in city.keys():
			count = int(city[city_name][1])
			sum_page = min((count + 23) // 24, 13)  # ceiling division by 24 listings per page, capped at 13 pages
			for page in range(1, sum_page + 1):
				yield get_url(city[city_name][0], page)
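
As a quick sanity check on the cap, here is a throwaway helper (hypothetical, not part of the crawler) that mirrors url_list's arithmetic:

def capped_pages(listing_count, per_page=24, max_pages=13):
	# Same ceiling-division-then-cap arithmetic as url_list above.
	return min((listing_count + per_page - 1) // per_page, max_pages)

assert capped_pages(500) == 13  # large cities hit the 13-page cap
assert capped_pages(100) == 5   # 100 listings fit in 5 pages of 24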


def get_links(url):
	# Collect every listing's detail-page link from one search-results page
	wb_data = requests.get(url, headers=headers)
	href = etree.HTML(wb_data.content)
	href_list = href.xpath('//*[@id="page_list"]/ul/li')
	for info in href_list:
		link = info.xpath('a/@href')[0].strip()
		get_info(link)

def sex_is(class_name):
	# Map the host avatar's CSS class to a gender label ('女' = female, '男' = male)
	if class_name == 'member_girl_ico':
		return '女'
	else:
		return '男'

def get_info(url):
	# Scrape one listing's detail page
	html_data = requests.get(url, headers=headers)
	selector = etree.HTML(html_data.text)
	item = {}
	item['title'] = selector.xpath('//div[@class="wrap clearfix con_bg"]/div[1]/div[1]/h4/em/text()')[0].strip()    # listing title
	item['address'] = selector.xpath('//div[@class="pho_info"]/p/span/text()')[0].strip()                           # address
	item['price'] = selector.xpath('//*[@id="pricePart"]/div[1]/span/text()')[0].strip()                            # nightly price
	item['img'] = selector.xpath('//*[@id="floatRightBox"]/div[3]/div[@class="member_pic"]/a/img/@src')[0].strip()  # host avatar URL
	item['sex'] = sex_is(selector.xpath('//*[@class="w_240"]/h6/span/@class')[0].strip())                           # host gender
	print(item)


if __name__ == '__main__':
	city = input('Enter the city to scrape (Chinese name, e.g. 上海): ')
	for url in url_list(city):  # e.g. entering 上海 yields every results-page URL for Shanghai
		print(url)
		get_links(url)
		time.sleep(2)  # pause between pages to avoid hammering the site
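
To persist results instead of printing them, here is a minimal sketch, assuming get_info is modified to return its item dict rather than print it (save_items is a hypothetical helper, not part of the original script):

import csv

def save_items(items, path='xiaozhu.csv'):
	# items: an iterable of dicts shaped like the one built in get_info
	fields = ['title', 'address', 'price', 'img', 'sex']
	with open(path, 'w', newline='', encoding='utf-8-sig') as f:  # utf-8-sig so Excel renders the Chinese text
		writer = csv.DictWriter(f, fieldnames=fields)
		writer.writeheader()
		writer.writerows(items)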
