# _*_ coding:utf-8 _*_
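"""Crawl second-hand housing listings for Shanghai from 58.com (sh.58.com/ershoufang)
and write the extracted fields (title, total price, unit price, area, location, URL)
to 上海房价信息.txt, six lines per listing."""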
import requests
from bs4 import BeautifulSoup as bs
import re
import os
import gc
import time
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
def get_info(url):
    url_part = []
    area_part = []
    r = requests.get(url, headers=headers).content
    soup = bs(r, 'html.parser')
    # Every listing link on the page carries tongji_label="listclick"
    house_url = soup.find_all('a', attrs={'tongji_label': 'listclick'})
    title_part = re.findall('tongji_label="listclick">(.*?) ', str(house_url))
    for t in house_url:
        u = str(t.get('href'))
        url_part.append(u)
    # Total price (e.g. "300万") sits in <p class="sum">; keep only the digits so
    # the float() conversion below gets a plain figure in 万
    house_price = soup.find_all('p', attrs={'class': "sum"})
    price_part = [re.sub(r'[^\d.]', '', p.get_text()) for p in house_price]
    # Unit price (e.g. "35000元/㎡") sits in <p class="unit">
    house_average_price = soup.find_all('p', attrs={'class': "unit"})
    average_price_part = re.findall('>(.*?)元/㎡', str(house_average_price))
    # Location: take the flattened text of every <p> tag on the page
    house_location = soup.find_all('p')
    location_part = [p.get_text(' ', strip=True) for p in house_location]
    # Derive the floor area (㎡) from total price (万) and unit price (元/㎡)
    for (k, v) in zip(price_part, average_price_part):
        area = round(float(k) * 10000 / float(v))
        area_part.append(str(area))
    # Release the parsed tag lists before handing the results back
    del house_url
    del house_price
    del house_location
    del house_average_price
    gc.collect()
    return title_part, price_part, average_price_part, area_part, location_part, url_part
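# Crawl listing pages 1-70 and write one six-line record per listing:
# title, total price (万), unit price (元/㎡), derived area (㎡), location, URL.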
g = 1
file = open('上海房价信息.txt', 'w', encoding='utf-8')
while g < 71:
    url = 'http://sh.58.com/ershoufang/pn%d/' % g
    print(g)
    g = g + 1
    a, b, c, d, e, f = get_info(url)
    # Trim over-long listing URLs in place
    for p, ur in enumerate(f):
        if len(ur) > 121:
            f[p] = ur[0:297]
    # One field per line: title, total price, unit price, area, location, URL
    for (i, j, k, l, m, n) in zip(a, b, c, d, e, f):
        file.write(i)
        file.write('\n')
        file.write(j)
        file.write('\n')
        file.write(k)
        file.write('\n')
        file.write(l)
        file.write('\n')
        print(m)
        file.write(m)
        file.write('\n')
        file.write(n)
        file.write('\n')
file.close()
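# Optional sketch for consuming the output (not called above): group every six
# lines of 上海房价信息.txt back into one (title, total price, unit price, area,
# location, URL) tuple. The name read_records is illustrative only.
def read_records(path='上海房价信息.txt'):
    with open(path, encoding='utf-8') as fh:
        lines = [line.rstrip('\n') for line in fh]
    return [tuple(lines[i:i + 6]) for i in range(0, len(lines) - 5, 6)]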