Java异常与调优一站式解决方案

!/usr/bin/python

from bs4 import BeautifulSoup
import requests
def getHouseList(url):

house =[]
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
#get从网页获取信息
res = requests.get(url,headers=headers)
#解析内容
soup = BeautifulSoup(res.content,'lxml')
#房源title
housename_divs = soup.find_all('div',class_='title')
for housename_div in housename_divs:
    housename_as=housename_div.find_all('a')
    for housename_a in housename_as:
        housename=[]
        #标题
        housename.append(housename_a.get_text())
        #超链接
        housename.append(housename_a['href'])
        house.append(housename)
huseinfo_divs = soup.find_all('div',class_='houseInfo')
for i in range(len(huseinfo_divs)):
    info = huseinfo_divs[i].get_text()
    infos = info.split('|')
    #小区称号
    house[i].append(infos[0])
    #户型
    house[i].append(infos[1])
    #平米
    house[i].append(infos[2])
#查询总价
house_prices = soup.find_all('div',class_='totalPrice')
for i in range(len(house_prices)):
    #价钱
    price = house_prices[i].get_text()
    house[i].append(price)
return house

爬取房屋细致信息:所在区域、套内面积

def houseinfo(url):

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
res = requests.get(url,headers=headers)
soup = BeautifulSoup(res.content,'lxml')
msg =[]
#所在区域
areainfos = soup.find_all('span',class_='info')
for areainfo in areainfos:
    #只需求获取第一个a标签的内容即可
    area = areainfo.find('a')
    if(not area):
        continue
    hrefStr = area['href']
    if(hrefStr.startswith('javascript')):
        continue
    msg.append(area.get_text())
    break
#依据房屋户型计算套内面积
infolist = soup.find_all('div',id='infoList')
num = []
for info in infolist:
    cols = info.find_all('div',class_='col')
    for i in cols:
        pingmi = i.get_text()
        try:
            a = float(pingmi[:-2])
            num.append(a)
        except ValueError:
            continue
msg.append(sum(num))
return msg

将房源信息写入txt文件

def writeFile(houseinfo):

f = open('d:/房源.txt','a',encoding='utf8')
# houseinfo.join('\n')
f.write(houseinfo+'\n')
f.close()

主函数

def main():

for i in range(1,100):
    print('-----分隔符',i,'-------')
    if i==1:
        url ='https://sjz.lianjia.com/ershoufang/hy1f2f5sf1l3l2l4a2a3a4/'
    else:
        url='https://sjz.lianjia.com/ershoufang/pg'+str(i)+'hy1f2f5sf1l3l2l4a2a3a4/'
    houses =getHouseList(url)
    for house in houses:
        link = house[1]
        if(not link.startswith('http')):
            continue
        mianji = houseinfo(link)
        #将套内面积、所在区域增加到房源信息
        house.extend(mianji)
        print(house)
        info = " ".join([str(x) for x in house])
        writeFile(info)

if name == '__main__':

main()

从链家网站查询到8849条房源信息,但是页面只能显现31(每页数量)*100(总页码)=3100条房源,其他没找到。

第二版:

获取某个小区的房源信息,并写入excel。

!/usr/bin/python

from bs4 import BeautifulSoup
import requests
import xlwt
def getHouseList(url):

house =[]
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
#get从网页获取信息
res = requests.get(url,headers=headers)
#解析内容
soup = BeautifulSoup(res.content,'html.parser')
#房源title
housename_divs = soup.find_all('div',class_='title')
for housename_div in housename_divs:
    housename_as=housename_div.find_all('a')
    for housename_a in housename_as:
        housename=[]
        #标题
        housename.append(housename_a.get_text())
        #超链接
        housename.append(housename_a.get('href'))
        house.append(housename)
huseinfo_divs = soup.find_all('div',class_='houseInfo')
for i in range(len(huseinfo_divs)):
    info = huseinfo_divs[i].get_text()
    infos = info.split('|')
    #小区称号
    house[i].append(infos[0])
    #户型
    house[i].append(infos[1])
    #平米
    house[i].append(infos[2])
#查询总价
house_prices = soup.find_all('div',class_='totalPrice')
for i in range(len(house_prices)):
    #价钱
    price = house_prices[i].get_text()
    house[i].append(price)
return house

爬取房屋细致信息:所在区域、套内面积

def houseinfo(url):

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
res = requests.get(url,headers=headers)
soup = BeautifulSoup(res.content,'html.parser')
msg =[]
#所在区域
areainfos = soup.find_all('span',class_='info')
for areainfo in areainfos:
    #只需求获取第一个a标签的内容即可
    area = areainfo.find('a')
    if(not area):
        continue
    hrefStr = area['href']
    if(hrefStr.startswith('javascript')):
        continue
    msg.append(area.get_text())
    break
#依据房屋户型计算套内面积
infolist = soup.find_all('div',id='infoList')
num = []
for info in infolist:
    cols = info.find_all('div',class_='col')
    for i in cols:
        pingmi = i.get_text()
        try:
            a = float(pingmi[:-2])
            num.append(a)
        except ValueError:
            continue
msg.append(sum(num))
return msg

将房源信息写入excel文件

def writeExcel(excelPath,houses):

workbook = xlwt.Workbook()
#获取第一个sheet页
sheet = workbook.add_sheet('git')
row0=['标题','链接地址','户型','面积','朝向','总价','所属区域','套内面积']
for i in range(0,len(row0)):
    sheet.write(0,i,row0[i])
for i in range(0,len(houses)):
    house = houses[i]
    print(house)
    for j in range(0,len(house)):
        sheet.write(i+1,j,house[j])
workbook.save(excelPath)

主函数

def main():

data = []
for i in range(1,5):
    print('-----分隔符',i,'-------')
    if i==1:
        url ='https://sjz.lianjia.com/ershoufang/l2rs%E5%92%8C%E5%B9%B3%E4%B8%96%E5%AE%B6/'
    else:
        url='https://sjz.lianjia.com/ershoufang/pg'+str(i)+'l2rs%E5%92%8C%E5%B9%B3%E4%B8%96%E5%AE%B6/'
    houses =getHouseList(url)
    for house in houses:
        link = house[1]
        if(not link or not link.startswith('http')):
            continue
        mianji = houseinfo(link)
        #将套内面积、所在区域增加到房源信息
        house.extend(mianji)
    data.extend(houses)
writeExcel('d:/house.xls',data)

if name == '__main__':

你可能感兴趣的:(java)