爬虫实战-北京链家,安居客二手房的爬取

链家mobile北京二手房5w多信息抓取,存MongoDB后进行数据可视化

import scrapy
from scrapy_redis.spiders import RedisCrawlSpider


# 57557套
# ~57,557 listings in total (site count at time of writing).
class LianjiaSpider(RedisCrawlSpider):
    """Distributed spider for Beijing second-hand housing on m.lianjia.com.

    Start URLs are pushed to the Redis list named by ``redis_key`` (scrapy-redis
    replaces ``start_urls``); each parsed listing is yielded as a plain dict.
    """

    name = 'lianjia'
    allowed_domains = ['m.lianjia.com']
    # start_urls = ['https://m.lianjia.com/bj/ershoufang/pg1/']  # superseded by Redis queue
    redis_key = 'lianjia'

    def parse(self, response):
        """Extract listing items from one result page and queue all other pages.

        Yields dicts with keys: name, detail, price, unit_price, href.
        """
        house_list = response.xpath("//*[@class='mod_cont lazyload_ulog']/ul/li")
        # First and last <li> are non-listing chrome on this page — skip them.
        for house in house_list[1:-1]:
            item = {
                'name': house.xpath('.//div[@class="item_main"]/text()').extract_first(),
                'detail': house.xpath('.//div[@class="item_other text_cut"]/text()').extract_first(),
                'price': house.xpath('string(.//*[@class="price_total"])').extract_first(),
                'unit_price': house.xpath('string(.//*[@class="unit_price"])').extract_first(),
            }
            # extract_first() returns None when the <a> is absent; concatenating
            # None to a str would raise TypeError, so guard before joining.
            href = house.xpath('./a/@href').extract_first()
            item['href'] = 'https://m.lianjia.com' + href if href else None
            yield item

        # Queue every result page. This loop re-runs on every parsed page, but
        # Scrapy's built-in duplicate filter drops the repeated requests.
        for page in range(2, 1918):
            url = 'https://m.lianjia.com/bj/ershoufang/pg{}/'.format(page)
            yield scrapy.Request(url, callback=self.parse)

 

安居客二手房:

import random
import time

import requests
from bs4 import BeautifulSoup

# Desktop Chrome UA so Anjuke serves the regular listing markup.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}

# Crawl result pages 1..29 of Xuzhou second-hand listings.
for i in range(1, 30):
    url = 'https://xuzhou.anjuke.com/sale/p{}/'.format(i)
    print(i)

    response = requests.get(url, headers=headers)  # fixed typo: was `respomse`
    response.raise_for_status()  # fail fast if the site blocks or errors out
    time.sleep(random.randint(0, 5))  # random pause to reduce anti-crawler bans

    soup = BeautifulSoup(response.text, 'lxml')
    house_list = soup.find_all('li', class_="list-item")
    for house in house_list:
        title_div = house.find('div', class_='house-title')
        price_span = house.find('span', class_='price-det')
        # Ad/placeholder <li> entries lack these nodes; find() returns None
        # there, and the original `.a.text` chain crashed with AttributeError.
        if title_div is None or title_div.a is None or price_span is None:
            continue
        title = title_div.a.text.strip()
        price = price_span.text.strip()

        # TODO: extract more fields (area, layout, location, ...)

        print('title:' + title + '  price:  ' + price)

 

你可能感兴趣的:(爬虫)