import requests
import lxml.html
import time
from fake_useragent import UserAgent
import pymongo
import random
# Base listings URL (kept for reference; pages are built from the pg{} template below).
f = "https://bj.lianjia.com/ershoufang/"

# Random User-Agent per request to reduce the chance of being blocked.
ua = UserAgent()

# MongoDB sink: scraped records go into lianjia.spider.
client = pymongo.MongoClient()
database = client['lianjia']
collection = database['spider']

# Abuyun dynamic HTTP tunnel proxy endpoint.
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
# Proxy tunnel credentials (placeholders — fill in real account values).
proxyUser = "xxxxx"
proxyPass = "xxxxx"
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

# Listing pages 1..100, built with a comprehension instead of a manual append loop.
url_list = ['https://bj.lianjia.com/ershoufang/pg{}/'.format(i) for i in range(1, 101)]

# Crawl state: t — keep-going flag; num — index of the next page to fetch.
t = True
num = 0
# Fetch each listing page through the proxy, parse the four per-listing fields,
# and insert one document per listing into MongoDB.
while t:
    try:
        HEADERS = {'Referer': 'https://bj.lianjia.com/?utm_source=baidu&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaotimiaoshu&utm_campaign=sousuo&ljref=pc_sem_baidu_ppzq_x',
                   'User-Agent': ua.random}
        print(url_list[num])
        html = requests.get(url_list[num], headers=HEADERS, proxies=proxies).content.decode()
        source = lxml.html.fromstring(html)
        title = source.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[2]/div/a/text()')
        info = source.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[2]/div/text()')
        price = source.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[6]/div[1]/span/text()')
        # Made consistent with the sibling selectors above (was a fragile absolute
        # /html/body/div[4]/... path pointing at the same listing node).
        per_price = source.xpath('//*[@id="content"]/div[1]/ul/li/div[1]/div[6]/div[2]/span/text()')
        # zip stops at the shortest list, so ragged xpath results can no longer
        # raise IndexError mid-page (the old range(len(title)) indexing could).
        for loc, inf, pr, pp in zip(title, info, price, per_price):
            dic = {'location': loc, 'info': inf, 'price': pr, 'per_price': pp}
            collection.insert_one(dic)
            print(dic)
        # Randomized delay between pages to look less like a bot.
        time.sleep(random.randint(3, 6))
        num += 1
        if num >= len(url_list):
            t = False
    except Exception as e:
        # Best-effort: log and skip the failed page.
        print(e)
        num += 1
        # Bug fix: the failure path also has to terminate. Previously num could
        # run past the end of url_list, making url_list[num] raise IndexError
        # forever — an infinite loop if the last page (or every page) failed.
        if num >= len(url_list):
            t = False
        continue
print('信息采集完毕')
# 采集这些网站最好花点钱买一个 IP 代理,这样不容易中途被封 IP。我用的是阿布云,1 块钱一个小时,很划算。
# 下面是采集的结果(部分数据)。
# 如果你正好有买房的想法,可以再采集全部成交二手房的信息,然后进行数据对比,这样就能知道你想买的房子大概市面价是多少。