链家数据爬虫

参考 http://blog.csdn.net/leeafay/article/details/76167189

使用 Python 库 BeautifulSoup 及 pandas

开发环境:pycharm

python版本:3.6.3

import urllib.request

from bs4 import BeautifulSoup

import pandas as pd

# beautifulsoup方法

## load html file

def get_content(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address handed unchanged to ``urllib.request.urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Exception: Whatever ``urlopen`` raises for an unreachable or
            invalid address is propagated unchanged.
    """
    # The context manager closes the response even when read() or decode()
    # raises, which the previous manual close() call did not guarantee.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")

def save_to_file(file_name, contents):
    """Write *contents* to *file_name* as UTF-8 text, replacing any existing file.

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Explicit UTF-8: the text handled by this script is decoded from UTF-8
    # and contains Chinese characters, which a non-UTF-8 locale default
    # encoding may be unable to represent. The with-block also guarantees the
    # handle is closed if write() raises.
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)

def _strip_affixes(text, prefix, suffix):
    """Return *text* without a leading *prefix* and a trailing *suffix*, if present."""
    text = text.strip()
    if prefix and text.startswith(prefix):
        text = text[len(prefix):]
    if suffix and text.endswith(suffix):
        text = text[:-len(suffix)]
    return text.strip()


def get_txt(info):
    """Extract the headline housing figures from a Lianjia price page.

    Args:
        info: HTML source of the page, as a string.

    Returns:
        A 6-tuple of strings, in this order:
        listing average price, houses on sale, houses sold in the last
        90 days, houses added yesterday, customers added yesterday,
        viewings yesterday.

    Raises:
        IndexError: If the page has fewer ``.txt`` or ``.num`` elements than
            the positions read below.

    NOTE(review): the element positions (``.txt`` items 1-2, ``.num`` items
    1-3) are taken from the original script; confirm them against the live
    page markup, which is not visible from this file.
    """
    soup = BeautifulSoup(info, "lxml")  # parse with the lxml backend

    # The original also had a disabled lookup of the month label via
    # soup.select('.qushi-1'); it was never part of the returned tuple.

    # Listing average price: text of every '.qushi-2 > .num' match. Reading
    # the tag text replaces the old str(list).strip('[]'), which returned the
    # raw list repr including the markup.
    saverage_price = ", ".join(
        tag.get_text(strip=True) for tag in soup.select('.qushi-2 > .num')
    )

    # Lianjia listing counts live in the '.txt' elements.
    total = soup.select('.txt')

    # str.strip(chars) removes a *set of characters*, not a substring; with
    # the label '最近90天内成交房源' it would also eat leading/trailing 9s
    # and 0s of the number. Remove the exact label and unit instead.
    # Houses currently on sale.
    stotal1 = _strip_affixes(total[1].get_text(strip=True), '在售房源', '套')
    # Houses sold within the last 90 days.
    stotal2 = _strip_affixes(
        total[2].get_text(strip=True), '最近90天内成交房源', '套'
    )

    # Yesterday's activity figures live in the '.num' elements.
    add = soup.select('.num')
    sadd1 = add[1].get_text(strip=True)  # houses added yesterday
    sadd2 = add[2].get_text(strip=True)  # customers added yesterday
    sadd3 = add[3].get_text(strip=True)  # viewings yesterday

    return saverage_price, stotal1, stotal2, sadd1, sadd2, sadd3

# Scrape the Hangzhou Lianjia price page, tabulate the six figures returned by
# get_txt as a single-row DataFrame, print it and save it as CSV.
url = "https://hz.lianjia.com/fangjia/"
content = get_content(url)
c = get_txt(content)

# One row; column order mirrors the order of the values in `c`.
df = pd.DataFrame(
    [list(c[:6])],
    columns=[
        "average_price",
        'house num on sale',
        'recent 90 days',  # houses sold in the last 90 days
        'new house num last day',
        'new guest last day',
        'new visit last day',
    ],
)

print(df)
df.to_csv('/Users/wzzhou/Desktop/test.csv')

你可能感兴趣的:(链家数据爬虫)